%%capture
%pip install -r ../requirements.txtAML Challenge 2024
Datenübersicht / ERM:

Setup
Die folgenden Code-Blöcke können genutzt werden, um die benötigten Abhängigkeiten zu installieren und zu importieren.
from itables import show
from itables import init_notebook_mode
init_notebook_mode()# Flag for debug mode
debug_mode = False# Laden der eingesetzten Libraries
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from IPython.display import display
from itables import init_notebook_mode
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
GridSearchCV,
StratifiedKFold,
)
from sklearn.metrics import (
roc_curve,
auc,
make_scorer,
confusion_matrix,
ConfusionMatrixDisplay,
fbeta_score,
cohen_kappa_score,
matthews_corrcoef,
)
import lime.lime_tabular
import shap
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from tqdm import tqdm
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel%%capture
# set theme ggplot for plots
plt.style.use("ggplot")
# set display options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)# Funktion zur Bestimmung des Geschlechts und Berechnung des Geburtstags
def parse_details(birth_number):
birth_number_str = str(
birth_number
) # Konvertiere birth_number zu einem String, falls notwendig
year_prefix = "19"
month = int(birth_number_str[2:4])
gender = "female" if month > 12 else "male"
if gender == "female":
month -= 50
year = int(year_prefix + birth_number_str[:2])
day = int(birth_number_str[4:6])
birth_day = datetime(year, month, day)
return gender, birth_day
# Berechnung des Alters basierend auf einem Basisjahr
def calculate_age(birth_date, base_date=datetime(1999, 12, 31)):
return (
base_date.year
- birth_date.year
- ((base_date.month, base_date.day) < (birth_date.month, birth_date.day))
)
# Regression metrics
def regression_results(y_true, y_pred):
print(
"explained_variance: ",
round(metrics.explained_variance_score(y_true, y_pred), 4),
)
print(
"mean_squared_log_error: ",
round(metrics.mean_squared_log_error(y_true, y_pred), 4),
)
print("r2: ", round(metrics.r2_score(y_true, y_pred), 4))
print("MAE: ", round(metrics.mean_absolute_error(y_true, y_pred), 4))
print("MSE: ", round(metrics.mean_squared_error(y_true, y_pred), 4))
print("RMSE: ", round(np.sqrt(metrics.mean_squared_error(y_true, y_pred)), 4))1. Aufgabenstellung
Inhalt der hier bearbeiteten und dokumentierten Mini-Challenge für das Modul «aml - Angewandtes Machine Learning» der FHNW ist die Entwicklung und Evaluierung von Affinitätsmodellen für personalisierte Kreditkarten-Werbekampagnen im Auftrag einer Bank. Das Ziel der Authoren ist es also, mithilfe von Kunden- und Transaktionsdaten präzise Modelle zu erstellen, die die Wahrscheinlichkeit des Kreditkartenkaufs einer bestimmten Person vorhersagen.
2. Laden der zur Verfügung gestellten Daten
Zur Verfügung gestellt wurden 8 csv-Dateien von welchen die Beschreibung der erfassten Variablen unter dem folgenden Link eingesehen werden können: PKDD’99 Discovery Challenge - Guide to the Financial Data Set. Nachfolgend werden diese csv-Dateien eingelesen.
account = pd.read_csv("./data/account.csv", sep=";", dtype={"date": "str"})
card = pd.read_csv("./data/card.csv", sep=";", dtype={"issued": "str"})
client = pd.read_csv("./data/client.csv", sep=";")
disp = pd.read_csv("./data/disp.csv", sep=";")
district = pd.read_csv("./data/district.csv", sep=";")
loan = pd.read_csv("./data/loan.csv", sep=";", dtype={"date": "str"})
order = pd.read_csv("./data/order.csv", sep=";")
trans = pd.read_csv("./data/trans.csv", sep=";", dtype={"date": "str", "bank": "str"})3. Datenaufbereitung & Explorative Datenanalyse
Im folgenden Abschnitt werden die geladenen Daten separat so transformiert, dass jede Zeile einer Observation und jede Spalte einer Variable im entsprechenden Datenformat entspricht, also ins Tidy-Format gebracht.
data_frames = {}Account
Der Datensatz accounts.csv beinhaltet 4500 Observationen mit den folgenden Informationen über die Kontos der Bank:
account_id: die Kontonummer,
district_id: den Standort der entsprechenden Bankfiliale,
frequency: die Frequenz der Ausstellung von Kontoauszügen (monatlich, wöchentlich, pro Transaktion) und
date: das Erstellungsdatum
account.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 account_id 4500 non-null int64
1 district_id 4500 non-null int64
2 frequency 4500 non-null object
3 date 4500 non-null object
dtypes: int64(2), object(2)
memory usage: 140.8+ KB
print("Anzahl fehlender Werte:", sum(account.isnull().sum()))
print("Anzahl duplizierter Einträge:", account.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Nachfolgend wird die date Spalte des account.csv-Datensatzes in das entsprechende Datenformat geparsed und die Werte von frequency übersetzt und als Levels einer Kategorie definiert.
# parse date
account["date"] = pd.to_datetime(account["date"], format="%y%m%d")
# translate categories
account["frequency"] = account["frequency"].replace(
{
"POPLATEK MESICNE": "monthly",
"POPLATEK TYDNE": "weekly",
"POPLATEK PO OBRATU": "transactional",
}
)
# convert column frequency to categorical
account["frequency"] = account["frequency"].astype("category")
# sample 5 random rows
account.sample(n=5)| account_id | district_id | frequency | date | |
|---|---|---|---|---|
| 2145 | 11079 | 16 | monthly | 1995-11-10 |
| 1208 | 1237 | 9 | monthly | 1994-03-13 |
| 3636 | 3304 | 36 | monthly | 1997-01-16 |
| 704 | 3220 | 29 | monthly | 1993-08-14 |
| 1240 | 2886 | 1 | monthly | 1994-04-08 |
Distrikt
Hier zu sehen ist die Verteilung der Distrikte pro Bankkonto. Ersichtlich ist, dass im Distrikt 1 mit Abstand am meisten Bankkontos geführt werden. Die darauf folgenden Distrikte bewegen sich alle im Bereich zwischen ~250 - 50 Bankkonten.
# plot the distribution of the district_ids and replace the id with it's name
plt.figure(figsize=(15, 6))
account["district_id"].value_counts().plot(kind="bar")
plt.title("Verteilung der Distrikte")
plt.xlabel("Distrikt")
plt.ylabel("Anzahl")
plt.show()
Frequenz
Auf dieser Visualisierung zu sehen ist die Klassenverteilung der Frequenz der Ausstellung der Kontoauszüge. Die allermeisten Bankkonten besitzen eine monatliche Ausstellung.
# Verteilung der Frequenz visualisieren
plt.figure(figsize=(10, 6))
account["frequency"].value_counts().plot(kind="bar")
plt.title("Frequenz der Kontoauszüge")
plt.xlabel("Frequenz")
plt.ylabel("Anzahl")
plt.show()
Datum
Der hier dargestellte Plot zeigt die Verteilung der Kontoerstellungsdaten. Das erste Konto wurde im Jahr 1993 und das neuste im 1998 erstellt.
# plot date distribution
plt.figure(figsize=(10, 6))
plt.hist(account["date"], bins=20)
plt.title("Verteilung der Kontoerstellungsdaten")
plt.xlabel("Datum")
plt.ylabel("Anzahl")
plt.show()
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
# append account data to dataframe collection
data_frames["account.csv"] = account# %%capture
# # generate sweetviz report
# svReport_account = sv.analyze(account)
# svReport_account.show_html(filepath="./reports/accounts.html", open_browser=False)Card
Der Datensatz card.csv beinhaltet 892 Observationen mit den folgenden Informationen über die von der Bank herausgegebenen Kreditkarten:
card_id: die Kartennummer,
disp_id: die Zuordnung zum entsprechenden Bankkonto und -inhaber (Disposition),
type: die Art der Kreditkarte (junior, classic, gold) und
issued: das Ausstellungsdatum
card.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 card_id 892 non-null int64
1 disp_id 892 non-null int64
2 type 892 non-null object
3 issued 892 non-null object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB
print("Anzahl fehlender Werte:", sum(card.isnull().sum()))
print("Anzahl duplizierter Einträge:", card.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Auch bei diesem Datensatz (card.csv) werden zunächst die Datentypen korrigiert um anschliessend die Inhalte entsprechend beschreiben zu können
# parse date
card["issued"] = pd.to_datetime(card["issued"].str[:6], format="%y%m%d")
card["issued"] = card["issued"].dt.to_period("M")
# convert type to categorical
card["type"] = card["type"].astype("category")
card.sample(n=5)| card_id | disp_id | type | issued | |
|---|---|---|---|---|
| 547 | 421 | 2644 | classic | 1998-04 |
| 802 | 746 | 4901 | classic | 1998-11 |
| 471 | 1239 | 13442 | junior | 1998-02 |
| 159 | 854 | 6753 | classic | 1996-10 |
| 651 | 1076 | 10604 | gold | 1998-07 |
Kartentyp
Hier dargestellt ist die Klassenverteilung der Kartentypen. Die meisten Karteninhaber besitzen eine klassische Kreditkarte, gefolgt von ~180 junior- und ~100 gold Karten.
# plot distribution of type
plt.figure(figsize=(10, 6))
card["type"].value_counts().plot(kind="bar")
plt.title("Verteilung der Kartentypen")
plt.xlabel("Kartentyp")
plt.ylabel("Anzahl")
plt.show()
Ausstellungsdatum
Hier dargestellt ist die Häufigkeit von Kreditkartenausstellungen pro Monat. Erkennbar ist eine steigende Tendenz mit einem Rückgang in den Monaten Februar - April 1997.
# plot issued date per month and year
plt.figure(figsize=(15, 6))
card["issued"].value_counts().sort_index().plot(kind="bar")
plt.title("Verteilung der Ausstellungsdaten")
plt.xlabel("Datum")
plt.ylabel("Anzahl")
plt.show()
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
# append to dataframes collection
data_frames["card.csv"] = card# %%capture
# # generate sweetviz report
# svReport_card = sv.analyze(card)
# svReport_card.show_html(filepath="./reports/card.html", open_browser=False)Client
Der Datensatz client.csv beinhaltet 5369 Observationen mit den folgenden Informationen über die Kunden der Bank:
client_id: die Kundennummer,
birth_number: eine Kombination aus Geburtsdatum und Geschlecht sowie
district_id: die Adresse
client.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 client_id 5369 non-null int64
1 birth_number 5369 non-null int64
2 district_id 5369 non-null int64
dtypes: int64(3)
memory usage: 126.0 KB
print("Anzahl fehlender Werte:", sum(client.isnull().sum()))
print("Anzahl duplizierter Einträge:", client.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Die Spalte birth_number des client.csv-Datensatzes codiert 3 Features der Bankkunden: Geschlecht, Geburtsdatum und damit auch das Alter. Diese Informationen werden mithilfe der zuvor definierten Funktionen parse_details() und calculate_age extrahiert.
# Geburtstag & Geschlecht aus birth_number extrahieren
client["gender"], client["birth_day"] = zip(
*client["birth_number"].apply(parse_details)
)
client["gender"] = client["gender"].astype("category")
# Alter berechnen
client["age"] = client["birth_day"].apply(calculate_age)
# Spalte birth_number entfernen
client = client.drop(columns=["birth_number"])
# Sample 5 random rows
client.sample(n=5)| client_id | district_id | gender | birth_day | age | |
|---|---|---|---|---|---|
| 1023 | 1078 | 24 | female | 1978-12-15 | 21 |
| 2306 | 2433 | 32 | male | 1964-06-19 | 35 |
| 1834 | 1945 | 54 | male | 1954-06-03 | 45 |
| 5296 | 13044 | 52 | male | 1976-03-01 | 23 |
| 1167 | 1227 | 33 | male | 1955-10-02 | 44 |
Geschlecht
Hier dargestellt ist die Verteilung des Geschlechts der Bankkunden. Das Geschlecht der erfassten Bankkunden ist fast gleichverteilt mit einem etwas kleineren Frauenanteil.
# plot distribution of gender
plt.figure(figsize=(10, 6))
gender_distribution = client["gender"].value_counts().plot(kind="bar")
plt.title("Verteilung des Geschlechts der Bankkunden")
plt.xlabel("Geschlecht")
plt.ylabel("Anzahl")
plt.show()
Alter
Nachfolgend abgebildet ist die Verteilung des Alters der Bankkunden. Die jüngste erfasste Person ist 12 Jahre alt und die älteste 88.
# plot distribution of age
plt.figure(figsize=(10, 6))
client["age"].plot(kind="hist", bins=20)
plt.title("Verteilung des Alters der Bankkunden")
plt.xlabel("Alter")
plt.ylabel("Anzahl")
plt.show()
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
data_frames["client.csv"] = client# %%capture
# svReport_client = sv.analyze(client)
# svReport_client.show_html(filepath="./reports/client.html", open_browser=False)Disp
Der Datensatz disp.csv beinhaltet 5369 Observationen mit den folgenden Informationen über die Dispositionen der Bank:
disp_id: der Identifikationsschlüssel der Disposition,
client_id: die Kundennummer,
account_id: die Kontonummer,
type: die Art der Disposition (Inhaber, Benutzer)
disp.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 disp_id 5369 non-null int64
1 client_id 5369 non-null int64
2 account_id 5369 non-null int64
3 type 5369 non-null object
dtypes: int64(3), object(1)
memory usage: 167.9+ KB
print("Anzahl fehlender Werte:", sum(disp.isnull().sum()))
print("Anzahl duplizierter Einträge:", disp.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Auch die Variablen des Datensatzes disp.csv werden in die korrekten Datentypen übertragen.
# Spalte type als Kategorie speichern
disp["type"] = disp["type"].astype("category")
# random sample
disp.sample(n=5)| disp_id | client_id | account_id | type | |
|---|---|---|---|---|
| 4416 | 4672 | 4672 | 3875 | OWNER |
| 4826 | 6473 | 6473 | 5362 | OWNER |
| 2223 | 2349 | 2349 | 1934 | OWNER |
| 2134 | 2254 | 2254 | 1858 | OWNER |
| 4788 | 6067 | 6067 | 5024 | OWNER |
Typ der Disposition
Hier dargestellt ist die Verteilung der Art der Dispositionen. 4500 Kunden sind Inhaber eines Kontos und 896 sind Disponenten.
# plot distribution of kind
plt.figure(figsize=(10, 6))
disp["type"].value_counts().plot(kind="bar")
plt.title("Verteilung der Dispositionen")
plt.xlabel("Disposition")
plt.ylabel("Anzahl")
plt.show()
# remove disponents
disp = disp[disp["type"] == "OWNER"]Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
data_frames["disp.csv"] = disp# %%capture
# svReport_disp = sv.analyze(disp)
# svReport_disp.show_html(filepath="./reports/disp.html", open_browser=False)District
Der Datensatz district.csv beinhaltet 77 Observationen mit den folgenden demografischen Informationen:
A1: die ID des Distrikts,
A2: der Name des Distrikts,
A3: die Region,
A4: die Anzahl der Einwohner,
A5: die Anzahl der Gemeinden mit < 499 Einwohner,
A6: die Anzahl der Gemeinden mit 500 - 1999 Einwohner,
A7: die Anzahl der Gemeinden mit 2000 - 9999 Einwohner,
A8: die Anzahl der Gemeinden mit >10000 Einwohner,
A9: die Anzahl Städte,
A10: das Verhältnis von städtischen Einwohnern,
A11: das durchschnittliche Einkommen,
A12: die Arbeitslosenrate vom Jahr 95,
A13: die Arbeitslosenrate vom Jahr 96,
A14: die Anzahl von Unternehmer pro 1000 Einwohner,
A15: die Anzahl von begangenen Verbrechen im Jahr 95,
A16: die Anzahl von begangenen Verbrechen im Jahr 96,
district.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A1 77 non-null int64
1 A2 77 non-null object
2 A3 77 non-null object
3 A4 77 non-null int64
4 A5 77 non-null int64
5 A6 77 non-null int64
6 A7 77 non-null int64
7 A8 77 non-null int64
8 A9 77 non-null int64
9 A10 77 non-null float64
10 A11 77 non-null int64
11 A12 77 non-null object
12 A13 77 non-null float64
13 A14 77 non-null int64
14 A15 77 non-null object
15 A16 77 non-null int64
dtypes: float64(2), int64(10), object(4)
memory usage: 9.8+ KB
print("Anzahl fehlender Werte:", sum(district.isnull().sum()))
print("Anzahl duplizierter Einträge:", district.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Zunächst werden die Spaltennamen in sprechendere übersetzt.
# Spalten umbenennen
district = district.rename(
columns={
"A1": "district_id",
"A2": "district_name",
"A3": "region",
"A4": "num_of_habitat",
"A5": "num_of_small_town",
"A6": "num_of_medium_town",
"A7": "num_of_big_town",
"A8": "num_of_bigger_town",
"A9": "num_of_city",
"A10": "ratio_of_urban",
"A11": "average_salary",
"A12": "unemploy_rate95",
"A13": "unemploy_rate96",
"A14": "n_of_enterpren_per1000_inhabit",
"A15": "no_of_crimes95",
"A16": "no_of_crimes96",
}
)[
[
"district_id",
"district_name",
"region",
"num_of_habitat",
"num_of_small_town",
"num_of_medium_town",
"num_of_big_town",
"num_of_bigger_town",
"num_of_city",
"ratio_of_urban",
"average_salary",
"unemploy_rate95",
"unemploy_rate96",
"n_of_enterpren_per1000_inhabit",
"no_of_crimes95",
"no_of_crimes96",
]
]
district["region"] = district["region"].astype("category")
district["district_name"] = district["district_name"].astype("category")Auffällig ist, dass nebst den Spalten A2 (dem Namen) und A3 (der Region) die Spalten A12 und A15 den Datentyp object erhalten. Das ist, weil jeweils ein fehlender Wert vorhanden ist, welcher mit einem ? gekennzeichnet ist.
# die fehlenden Werte anzeigen
district[district.isin(["?"]).any(axis=1)]| district_id | district_name | region | num_of_habitat | num_of_small_town | num_of_medium_town | num_of_big_town | num_of_bigger_town | num_of_city | ratio_of_urban | average_salary | unemploy_rate95 | unemploy_rate96 | n_of_enterpren_per1000_inhabit | no_of_crimes95 | no_of_crimes96 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 68 | 69 | Jesenik | north Moravia | 42821 | 4 | 13 | 5 | 1 | 3 | 48.4 | 8173 | ? | 7.01 | 124 | ? | 1358 |
Wir gehen davon aus, dass es sich hier um effektiv fehlende Werte handelt und nicht um zensierte Daten, also Werte, für welche der exakte Wert fehlt, aber trotzdem Informationen vorhanden sind. In diesem Fall, wenn die Variable mit den fehlenden Werten eine hohe Korrelation mit anderen Prediktoren aufweist, bietet es sich an, KNN oder eine einfache lineare Regression für die Imputation anzuwenden. (Branco, Torgo, and Ribeiro 2017)
Die Korrelationsmatrix des SweetViz Reports zeigt, dass unemploy_rate95 stark mit unemploy_rate96 und no_of_crimes95 mit no_of_crimes96 korreliert.
# die ? ersetzen mit NaN
district = district.replace("?", np.nan)
# Datentyp korrigieren
district["no_of_crimes95"] = district["no_of_crimes95"].astype(float)
district["unemploy_rate95"] = district["unemploy_rate95"].astype(float)# Korrelation zwischen Arbeitslosenquote 95 und 96
district[["unemploy_rate95", "unemploy_rate96"]].corr()| unemploy_rate95 | unemploy_rate96 | |
|---|---|---|
| unemploy_rate95 | 1.000000 | 0.981521 |
| unemploy_rate96 | 0.981521 | 1.000000 |
# Korrelation zwischen Anzahl Verbrechen 95 und 96
district[["no_of_crimes95", "no_of_crimes96"]].corr()| no_of_crimes95 | no_of_crimes96 | |
|---|---|---|
| no_of_crimes95 | 1.000000 | 0.998426 |
| no_of_crimes96 | 0.998426 | 1.000000 |
Demnach werden nachfolgend zwei lineare Regressions-Modelle trainiert, um die fehlenden Werte zu imputieren.
# Zeilen filtern, sodass keine fehlenden Werte vorhanden sind
district_no_na = district[district["unemploy_rate95"].notnull()]
# Lineares regressions Modell erstellen
lin_reg_unemploy = LinearRegression()
# Modell fitten
lin_reg_unemploy.fit(
district_no_na["unemploy_rate96"].values.reshape(-1, 1),
district_no_na["unemploy_rate95"].values,
)
# Modell evaluieren
regression_results(
district_no_na["unemploy_rate95"],
lin_reg_unemploy.predict(district_no_na["unemploy_rate96"].values.reshape(-1, 1)),
)explained_variance: 0.9634
mean_squared_log_error: 0.0051
r2: 0.9634
MAE: 0.231
MSE: 0.1002
RMSE: 0.3166
Der \(R^2\) Wert von \(0.9634\) versichert, damit ein stabiles Modell für die Imputation erreicht zu haben.
# Lineares regressions Modell erstellen
lin_reg_crime = LinearRegression()
# Modell fitten
lin_reg_crime.fit(
district_no_na["no_of_crimes96"].values.reshape(-1, 1),
district_no_na["no_of_crimes95"].values,
)
# Modell evaluieren
regression_results(
district_no_na["no_of_crimes95"],
lin_reg_crime.predict(district_no_na["no_of_crimes96"].values.reshape(-1, 1)),
)explained_variance: 0.9969
mean_squared_log_error: 0.0219
r2: 0.9969
MAE: 383.5379
MSE: 303529.5111
RMSE: 550.9351
Auch hier mit einem \(R^2\) Wert von \(0.9969\) gehen wir davon aus, damit ein stabiles Modell für die Imputation erreicht zu haben. Somit werden nachfolgend die beiden Modelle genutzt, um die fehlenden Werte einzufüllen.
# Vorhersage der fehlenden Werte
district.loc[district["no_of_crimes95"].isnull(), "no_of_crimes95"] = (
lin_reg_crime.predict(
district[district["no_of_crimes95"].isnull()]["no_of_crimes96"].values.reshape(
-1, 1
)
)
)
district.loc[district["unemploy_rate95"].isnull(), "unemploy_rate95"] = (
lin_reg_unemploy.predict(
district[district["unemploy_rate95"].isnull()][
"unemploy_rate96"
].values.reshape(-1, 1)
)
)district.sample(n=5)| district_id | district_name | region | num_of_habitat | num_of_small_town | num_of_medium_town | num_of_big_town | num_of_bigger_town | num_of_city | ratio_of_urban | average_salary | unemploy_rate95 | unemploy_rate96 | n_of_enterpren_per1000_inhabit | no_of_crimes95 | no_of_crimes96 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 13 | Rakovnik | central Bohemia | 53921 | 61 | 22 | 1 | 1 | 2 | 41.3 | 8598 | 2.77 | 3.26 | 123 | 1597.0 | 1875 |
| 58 | 59 | Kromeriz | south Moravia | 108871 | 41 | 29 | 7 | 2 | 6 | 62.1 | 8444 | 3.24 | 3.47 | 106 | 2595.0 | 2305 |
| 26 | 27 | Plzen - jih | west Bohemia | 67298 | 71 | 19 | 10 | 0 | 7 | 43.8 | 8561 | 0.65 | 1.29 | 110 | 1029.0 | 1127 |
| 60 | 61 | Trebic | south Moravia | 117897 | 139 | 28 | 5 | 1 | 6 | 53.8 | 8814 | 4.76 | 5.74 | 107 | 2112.0 | 2059 |
| 33 | 34 | Chomutov | north Bohemia | 125236 | 28 | 11 | 1 | 4 | 5 | 87.7 | 9675 | 6.43 | 7.68 | 100 | 5323.0 | 5190 |
district.isnull().sum()district_id 0
district_name 0
region 0
num_of_habitat 0
num_of_small_town 0
num_of_medium_town 0
num_of_big_town 0
num_of_bigger_town 0
num_of_city 0
ratio_of_urban 0
average_salary 0
unemploy_rate95 0
unemploy_rate96 0
n_of_enterpren_per1000_inhabit 0
no_of_crimes95 0
no_of_crimes96 0
dtype: int64
EDA
Es gibt keine Duplikate und somit 77 unterschiedliche Namen der Distrikte. Diese sind auf 8 Regionen verteilt, wobei die meisten in south Moravia und die wenigsten in Prague liegen. Der Distrikt mit den wenigsten Einwohnern zählt 42821, im Vergleich zu demjenigen mit den meisten: 1204953, wobei die nächst kleinere Ortschaft 102609 Einwohner zählt. Weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
data_frames["district.csv"] = district# %%capture
# svReport_district = sv.analyze(district)
# svReport_district.show_html(filepath="./reports/district.html", open_browser=False)Loan
Der Datensatz loan.csv beinhaltet 682 Observationen mit den folgenden Informationen über die vergebenen Darlehen der Bank:
loan_id: ID des Darlehens,
account_id: die Kontonummer,
date: das Datum, wann das Darlehen gewährt wurde,
amount: der Betrag,
duration: die Dauer des Darlehens,
payments: die höhe der monatlichen Zahlungen und
status: der Rückzahlungsstatus (A: ausgeglichen, B: Vertrag abgelaufen aber nicht fertig bezahlt, C: laufender Vertrag und alles in Ordnung, D: laufender Vertrag und Kunde verschuldet)
loan.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 loan_id 682 non-null int64
1 account_id 682 non-null int64
2 date 682 non-null object
3 amount 682 non-null int64
4 duration 682 non-null int64
5 payments 682 non-null float64
6 status 682 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 37.4+ KB
print("Anzahl fehlender Werte:", sum(loan.isnull().sum()))
print("Anzahl duplizierter Einträge:", loan.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Auch für den loan.csv Datensatz werden zunächst Datenformate korrigiert und Kategorien übersetzt. Anschliessend wird überprüft, ob ein Bankkonto mehrere Darlehen besitzt.
# Datum parsen
loan["date"] = pd.to_datetime(loan["date"], format="%y%m%d")
# Kategorien übersetzen
loan["status"] = loan["status"].map(
{
"A": "contract finished",
"B": "finished contract, loan not paid",
"C": "running contract",
"D": "client in debt",
}
)
loan["status"] = loan["status"].astype("category")# Anzahl der Darlehen pro Kontonummer berechnen
num_of_loan_df = (
loan.groupby("account_id")
.size()
.reset_index(name="num_of_loan")
.sort_values(by="num_of_loan", ascending=False)
)# Überprüfen, ob jedes Konto nur ein Darlehen hat
num_of_loan_df["num_of_loan"].value_counts()num_of_loan
1 682
Name: count, dtype: int64
Von allen Bankkontos, die ein Darlehen aufgenommen haben, hat jedes Konto genau ein Darlehen zugewiesen.
# Sample 5 random rows from the joined DataFrame
display(loan.sample(n=5))| loan_id | account_id | date | amount | duration | payments | status | |
|---|---|---|---|---|---|---|---|
| 493 | 6437 | 7052 | 1997-11-09 | 63972 | 36 | 1777.0 | client in debt |
| 533 | 6486 | 7240 | 1998-01-15 | 152160 | 60 | 2536.0 | running contract |
| 346 | 6415 | 6950 | 1997-02-12 | 475680 | 48 | 9910.0 | running contract |
| 374 | 5210 | 1252 | 1997-04-01 | 148140 | 36 | 4115.0 | running contract |
| 231 | 6137 | 5574 | 1996-04-10 | 44628 | 12 | 3719.0 | contract finished |
Ausstellungsdatum
Nachfolgend dargestellt ist die Verteilung der Darlehensausstellungsdaten. das erste Darlehen wurde im Juli 1993 ausgestellt und das neuste im Dezember 1998.
# plot distribution of date
plt.figure(figsize=(15, 6))
loan["date"].dt.to_period("M").value_counts().sort_index().plot(kind="bar")
plt.title("Verteilung der Darlehensausstellungsdaten")
plt.xlabel("Datum")
plt.ylabel("Anzahl")
plt.show()
Dauer
Hier ersichtlich ist die Verteilung der Dauer der Darlehen. Sie ist fast gleichverteilt über die 5 möglichen Optionen.
# plot duration distribution
plt.figure(figsize=(10, 6))
loan["duration"].value_counts().plot(kind="bar")
plt.title("Verteilung der Darlehensdauer")
plt.xlabel("Dauer")
plt.ylabel("Anzahl")
plt.show()
Betrag
Hier dargestellt ist die Verteilung der Darlehensbeträge. Nur wenige Darlehensbeträge sind höher als 400000 wobei die meisten um die 100000 betragen.
# plot amount
plt.figure(figsize=(10, 6))
loan["amount"].plot(kind="hist", bins=20)
plt.title("Verteilung der Darlehensbeträge")
plt.xlabel("Betrag")
plt.ylabel("Anzahl")
plt.show()
Status
Der nachfolgende Plot zeigt die Klassenverteilung vom Darlehensstatus. Die meisten (~400) sind laufend und ok, rund 200 sind abgeschlossen, die Kunden von ~50 Darlehen sind verschuldet und etwas weniger wurden abgeschlossen, ohne fertig abbezahlt worden zu sein.
# plot status distribution
plt.figure(figsize=(10, 6))
loan["status"].value_counts().plot(kind="bar")
plt.title("Verteilung der Darlehensstatus")
plt.xlabel("Status")
plt.ylabel("Anzahl")
plt.show()
Zahlungen
Hier ersichtlich ist die Verteilung der monatlichen Zahlungen der Darlehen. Die kleinste monatliche Zahlung beträgt 304 und die höchste 9910.
# plot payments
plt.figure(figsize=(10, 6))
loan["payments"].plot(kind="hist", bins=20)
plt.title("Verteilung der monatlichen Zahlungen")
plt.xlabel("Zahlungen")
plt.ylabel("Anzahl")
plt.show()
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
# Assign the resulting DataFrame to a dictionary for storage
data_frames["loan.csv"] = loan# %%capture
# svReport_loan = sv.analyze(loan)
# svReport_loan.show_html(filepath="./reports/loan.html", open_browser=False)Order
Der Datensatz order.csv beinhaltet 6471 Observationen mit den folgenden Informationen über die Daueraufträge eines Kontos:
order_id: die Nummer des Dauerauftrags,
account_id: die Kontonummer von welchem der Auftrag stammt,
bank_to: die empfangende Bank,
account_to: das empfangende Konto,
amount: der Betrag,
k_symbol: die Art des Auftrags (Versicherungszahlung, Haushalt, Leasing, Darlehen)
order.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6471 entries, 0 to 6470
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 6471 non-null int64
1 account_id 6471 non-null int64
2 bank_to 6471 non-null object
3 account_to 6471 non-null int64
4 amount 6471 non-null float64
5 k_symbol 6471 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 303.5+ KB
print("Anzahl fehlender Werte:", sum(order.isnull().sum()))
print("Anzahl duplizierter Einträge:", order.duplicated().sum())Anzahl fehlender Werte: 0
Anzahl duplizierter Einträge: 0
Aufbereitung
Auch für order.csv werden die Kategorien zunächst übersetzt und fehlende Werte mit der Kategorie unknown ersetzt. Es bestehen deutlich mehr Daueraufträge als Bankkontos, was darauf hindeutet, dass ein Bankkonto mehrere Daueraufträge eingerichtet haben kann. Zur weiteren Verarbeitung der Daten wird das Format so geändert, dass pro Konto ein order-Eintrag existiert.
# Kategorien übersetzen und fehlende Werte mit "unknown" füllen
order["k_symbol"] = (
order["k_symbol"]
.map(
{
"POJISTNE": "insurance_payment",
"SIPO": "household",
"UVER": "loan_payment",
"LEASING": "leasing",
}
)
.fillna("unknown")
)
order["k_symbol"] = order["k_symbol"].astype("category")# Merge with 'account_id_df' to ensure all accounts are represented
order = pd.merge(account[["account_id"]], order, on="account_id", how="left")
# After merging, fill missing values that may have been introduced
order["k_symbol"] = order["k_symbol"].fillna("unknown")
order["amount"] = order["amount"].fillna(0)
order["has_order"] = ~order.isna().any(axis=1)
orders_pivot = order.pivot_table(
index="account_id",
columns="k_symbol",
values="amount",
aggfunc="sum",
observed=False,
)
# Add prefix to column names
orders_pivot.columns = orders_pivot.columns
orders_pivot = orders_pivot.reset_index()
# NaN to 0
orders_pivot = orders_pivot.fillna(0)
# Sample 5 random rows from the merged DataFrame
orders_pivot.sample(n=5)| k_symbol | account_id | household | insurance_payment | leasing | loan_payment | unknown |
|---|---|---|---|---|---|---|
| 3611 | 3802 | 3276.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3150 | 3312 | 813.0 | 330.0 | 0.0 | 0.0 | 821.0 |
| 176 | 187 | 2798.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3639 | 3832 | 160.0 | 0.0 | 0.0 | 0.0 | 4176.0 |
| 4162 | 7011 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Empfangende Bank
Die Verteilung der empfangenden Banken ist ziemlich ausgeglichen, wobei in 742 Observationen diese Angabe fehlt.
Empfangendes Konto
Auch bei den empfangenden Konten scheint es keine auffällige Konzentration bei wenigen Konten zu geben und bei 742 Observationen fehlt die Angabe ebenfalls.
Betrag
Der Betrag bewegt sich im Bereich zwischen 0 - 14882 mit einem Mittelwert von 2943 und einem Median von 2249. Die Verteilung ist also stark rechtsschief
Art
Die meisten Daueraufträge sind betreffend dem Haushalt eingerichtet worden (3502), die wenigsten für Leasing (341).
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
# Assuming data_frames is a dictionary for storing DataFrames
data_frames["order.csv"] = orders_pivot# %%capture
# svReport_order = sv.analyze(order)
# svReport_order.show_html(filepath="./reports/order.html", open_browser=False)Trans
Der Datensatz trans.csv beinhaltet 1056320 Observationen mit den folgenden Informationen über die Transaktionen eines Kontos:
trans_id: die ID der Transaktion,
account_id: die Kontonummer des ausführenden Kontos,
date: das Datum,
type: der Typ (Einzahlung, Bezug),
operation: die Art der Transaktion (Bezug Kreditkarte, Bareinzahlung, Bezug über eine andere Bank, Bezug Bar, Überweisung)
amount: der Betrag der Transaktion,
balance: der Kontostand nach ausführung der Transaktion,
k_symbol: die Klassifikation der Transaktion (Versicherungszahlung, Kontoauszug, Zinsauszahlung, Zinszahlung bei negativem Kontostand, Haushalt, Pension, Darlehensauszahlung),
bank: die empfangende Bank und
account: das empfangende Bankkonto
trans.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 trans_id 1056320 non-null int64
1 account_id 1056320 non-null int64
2 date 1056320 non-null object
3 type 1056320 non-null object
4 operation 873206 non-null object
5 amount 1056320 non-null float64
6 balance 1056320 non-null float64
7 k_symbol 574439 non-null object
8 bank 273508 non-null object
9 account 295389 non-null float64
dtypes: float64(3), int64(2), object(5)
memory usage: 80.6+ MB
print("Anzahl fehlender Werte:", sum(trans.isnull().sum()))
print("Anzahl duplizierter Einträge:", trans.duplicated().sum())Anzahl fehlender Werte: 2208738
Anzahl duplizierter Einträge: 0
Aufbereitung
Die Kategorien für type, operation und k_symbol wurden übersetzt und die Datentypen korrigiert.
trans["date"] = pd.to_datetime(trans["date"], format="%y%m%d")
# Update 'type' column
trans["type"] = trans["type"].replace(
{"PRIJEM": "credit", "VYDAJ": "withdrawal", "VYBER": "withdrawal"}
)
trans["type"] = trans["type"].astype("category")
# Update 'operation' column
trans["operation"] = trans["operation"].replace(
{
"VYBER KARTOU": "credit card withdrawal",
"VKLAD": "credit in cash",
"PREVOD Z UCTU": "collection from another bank",
"VYBER": "cash withdrawal",
"PREVOD NA UCET": "remittance to another bank",
}
)
trans["operation"] = trans["operation"].astype("category")
# Update 'k_symbol' column
trans["k_symbol"] = trans["k_symbol"].replace(
{
"POJISTNE": "insurance payment",
"SLUZBY": "statement payment",
"UROK": "interest credited",
"SANKC. UROK": "sanction interest if negative balance",
"SIPO": "household payment",
"DUCHOD": "pension credited",
"UVER": "loan payment",
}
)
trans["k_symbol"] = trans["k_symbol"].astype("category")
# negate the amount if type is credit
trans.loc[trans["type"] == "withdrawal", "amount"] = trans.loc[
trans["type"] == "withdrawal", "amount"
] * (-1)# Sample 5 random rows from the DataFrame
trans.sample(n=5)| trans_id | account_id | date | type | operation | amount | balance | k_symbol | bank | account | |
|---|---|---|---|---|---|---|---|---|---|---|
| 802547 | 112089 | 378 | 1998-03-05 | withdrawal | cash withdrawal | -5617.0 | 28666.6 | NaN | NaN | NaN |
| 545359 | 1154147 | 3948 | 1997-05-06 | withdrawal | remittance to another bank | -314.0 | 40300.2 | household payment | YZ | 3428841.0 |
| 156001 | 697168 | 2384 | 1995-04-10 | credit | collection from another bank | 47906.0 | 110776.4 | NaN | KL | 13283962.0 |
| 399169 | 3021568 | 10019 | 1996-10-12 | withdrawal | remittance to another bank | -5415.0 | 64533.0 | loan payment | QR | 87297737.0 |
| 505809 | 482260 | 1642 | 1997-03-08 | withdrawal | remittance to another bank | -3597.0 | 12514.1 | household payment | UV | 27920514.0 |
Zeitliche Entwicklung eines Kontos
# Plot Zeitliche Entwicklung des Konto-Saldos für die Konto nummer 19
account_19 = trans[trans["account_id"] == 19].copy() # Create a copy of the DataFrame
# Ensure the date column is in datetime format
account_19["date"] = pd.to_datetime(account_19["date"])
# Sort the values by date
account_19 = account_19.sort_values("date")
plt.figure(figsize=(10, 6))
plt.plot(account_19["date"], account_19["balance"])
plt.title("Time evolution of balance for account number 19")
plt.xlabel("Date")
plt.ylabel("Balance")
plt.show()
# zoom the year 1995 of the plot
account_19_1995 = account_19[account_19["date"].dt.year == 1995]
# plot it
plt.figure(figsize=(10, 6))
plt.plot(account_19_1995["date"], account_19_1995["balance"])
plt.title("Time evolution of balance for account number 19 in 1995")
plt.xlabel("Date")
plt.ylabel("Balance")
plt.show()
# Wee see that there is a steep line in 1995-10 so there are two transactions, this we have to clean.
Korrelation & weitere Informationen
Die Korrelation sowie weitere Informationen zu den vorhandenen Daten können aus dem SweetViz Report entnommen werden.
# Assign to a dictionary if needed (similar to list assignment in R)
data_frames["trans.csv"] = trans# %%capture
# svReport_trans = sv.analyze(trans)
# svReport_trans.show_html(filepath="./reports/trans.html", open_browser=False)4. Kombinieren der Daten zu einem Modellierungsdatensatz
Im nachfolgenden Abschnitt werden die Daten zu statischen (Kunden-) Daten und transaktionellen (Bankdienstleistungs-) Daten kombiniert um diese anschliessend zu einem Datensatz für die Modellierung zusammenzufügen.
Stammdaten
Die aufbereiteten Stammdaten aus den Dateien
disp.csv
account.csv
client.csv
card.csv
loan.csv
order.csv
districts.csv
werden nachfolgend zu einem Datensatz kombiniert.
# merge dataframes
static_data = (
data_frames["disp.csv"]
.merge(data_frames["account.csv"], on="account_id", validate="1:1", how="left")
.merge(
data_frames["client.csv"],
on="client_id",
validate="1:1",
suffixes=("_account", "_client"),
how="left",
)
.merge(
data_frames["card.csv"],
on="disp_id",
validate="1:1",
suffixes=("_disp", "_card"),
how="left",
)
.merge(
data_frames["loan.csv"],
on="account_id",
suffixes=("_account", "_loan"),
validate="1:1",
how="left",
)
.merge(data_frames["order.csv"], on="account_id", validate="1:1", how="left")
.merge(
data_frames["district.csv"].add_suffix("_account"),
left_on="district_id_account",
right_on="district_id_account",
validate="m:1",
how="left",
)
.merge(
data_frames["district.csv"].add_suffix("_client"),
left_on="district_id_client",
right_on="district_id_client",
validate="m:1",
how="left",
)
)static_data["has_card"] = ~static_data["card_id"].isna()static_data["type_card"] = static_data["type_card"].cat.add_categories(["none"])
static_data.loc[static_data["card_id"].isna(), "type_card"] = "none"# get static_data status categories
static_data["status"] = static_data["status"].cat.add_categories(["none"])
static_data.loc[static_data["status"].isna(), "status"] = "none"static_data.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 disp_id 4500 non-null int64
1 client_id 4500 non-null int64
2 account_id 4500 non-null int64
3 type_disp 4500 non-null category
4 district_id_account 4500 non-null int64
5 frequency 4500 non-null category
6 date_account 4500 non-null datetime64[ns]
7 district_id_client 4500 non-null int64
8 gender 4500 non-null category
9 birth_day 4500 non-null datetime64[ns]
10 age 4500 non-null int64
11 card_id 892 non-null float64
12 type_card 4500 non-null category
13 issued 892 non-null period[M]
14 loan_id 682 non-null float64
15 date_loan 682 non-null datetime64[ns]
16 amount 682 non-null float64
17 duration 682 non-null float64
18 payments 682 non-null float64
19 status 4500 non-null category
20 household 4500 non-null float64
21 insurance_payment 4500 non-null float64
22 leasing 4500 non-null float64
23 loan_payment 4500 non-null float64
24 unknown 4500 non-null float64
25 district_name_account 4500 non-null category
26 region_account 4500 non-null category
27 num_of_habitat_account 4500 non-null int64
28 num_of_small_town_account 4500 non-null int64
29 num_of_medium_town_account 4500 non-null int64
30 num_of_big_town_account 4500 non-null int64
31 num_of_bigger_town_account 4500 non-null int64
32 num_of_city_account 4500 non-null int64
33 ratio_of_urban_account 4500 non-null float64
34 average_salary_account 4500 non-null int64
35 unemploy_rate95_account 4500 non-null float64
36 unemploy_rate96_account 4500 non-null float64
37 n_of_enterpren_per1000_inhabit_account 4500 non-null int64
38 no_of_crimes95_account 4500 non-null float64
39 no_of_crimes96_account 4500 non-null int64
40 district_name_client 4500 non-null category
41 region_client 4500 non-null category
42 num_of_habitat_client 4500 non-null int64
43 num_of_small_town_client 4500 non-null int64
44 num_of_medium_town_client 4500 non-null int64
45 num_of_big_town_client 4500 non-null int64
46 num_of_bigger_town_client 4500 non-null int64
47 num_of_city_client 4500 non-null int64
48 ratio_of_urban_client 4500 non-null float64
49 average_salary_client 4500 non-null int64
50 unemploy_rate95_client 4500 non-null float64
51 unemploy_rate96_client 4500 non-null float64
52 n_of_enterpren_per1000_inhabit_client 4500 non-null int64
53 no_of_crimes95_client 4500 non-null float64
54 no_of_crimes96_client 4500 non-null int64
55 has_card 4500 non-null bool
dtypes: bool(1), category(9), datetime64[ns](3), float64(18), int64(24), period[M](1)
memory usage: 1.6 MB
print("Anzahl duplizierter Einträge:", static_data.duplicated().sum())Anzahl duplizierter Einträge: 0
# fillna for payments, duration, amount
static_data["payments"] = static_data["payments"].fillna(0)
static_data["duration"] = static_data["duration"].fillna(0)
static_data["amount"] = static_data["amount"].fillna(0)Damit wird ein Datensatz mit 4500 individuellen Kunden und 56 Spalten erzeugt. 892 dieser Kunden besitzen eine Kreditkarte und 682 haben einen Kredit aufgenommen.
Entfernen der Junior Karteninhaber
Kunden im jugendlichen Alter sind speziell interessante Kunden für eine Bank, da diese grundsätzlich noch keine bis wenige Bankdienstleistungen beziehen und somit flexibel sind. Es ist deshalb sehr vorteilhaft für ein Unternehmen diese für sich zu gewinnen, weshalb viele Banken für solche Kunden ganz spezifische Prozesse definieren. Das in dieser Aufgabenstellung gewünschte Modell würde in so einem Prozess nicht eingesetzt werden, weshalb die jugendlichen Kunden nachfolgend aus dem Datensatz entfernt werden.
num_accounts_before = len(static_data)
# # Filter rows where 'card_type' does not contain 'junior' (case insensitive)
static_data = static_data[
~static_data["type_card"].str.contains("junior", case=False, na=False)
]
num_accounts_after = len(static_data)
num_junior_cards = num_accounts_before - num_accounts_after
print(f"Number of junior cards removed: {num_junior_cards}")Number of junior cards removed: 145
Durch diese Entfernung wurden 145 Kunden entfernt.
Bewegungsdaten
Um einen Datensatz zu erhalten, bei welchem jede Zeile eine Observation repräsentiert müssen die Transaktionen pro Kunde entsprechend aufgerollt werden. Das bedeutet, ein vordefiniertes Zeitfenster vor dem zu modellierenden Event zu definieren und die darin enthaltenen Daten in einer Zeile zu aggregieren. Das gesuchte Zeitfenster beinhaltet bestenfalls saisonale Gegebenheiten und stets einen Lag-Zeitraum, der die Verzögerung der Kaufentscheidung und Ausführung des Auftrags aufzeichnen soll. Hier wird ein Rollup-Fenster inklusive Lag von 13 Monaten eingesetzt.
Käufer
Für Kunden, die bereits eine Kreditkarte besitzen, ist es unkompliziert, das Rollup-Fenster zu identifizieren.
# select all transactions from trans from date 1995-03-16 and account_id 150
trans[(trans["date"] == "1995-03-16") & (trans["account_id"] == 150)]| trans_id | account_id | date | type | operation | amount | balance | k_symbol | bank | account | |
|---|---|---|---|---|---|---|---|---|---|---|
| 148637 | 44703 | 150 | 1995-03-16 | credit | credit in cash | 900.0 | 2800.0 | NaN | NaN | NaN |
| 148638 | 44714 | 150 | 1995-03-16 | credit | credit in cash | 1900.0 | 1900.0 | NaN | NaN | NaN |
Aus dieser Tabelle ersichtlich ist, dass für den Kunden 150 zum Datum der Eröffnung des Kontos mehrere Transaktionen vorhanden sind und dass wenn die Beträge von dem Tag aufsummiert werden, der korrekte Kontostand resultiert (1900 + 900 = 2800). Deshalb wird nachfolgend davon ausgegangen, dass die Aufsummierung der Transaktionsbeträge zum korrekten Kontostand führt.
# sort dataframe trans by account_id and date
first_row_per_account = trans.groupby("account_id")
# select rows where amount == balance
first_row_per_account = first_row_per_account.apply(
lambda x: x[x["amount"] == x["balance"]].iloc[0], include_groups=False
).reset_index()# show that there's one row per unique account_id in trans
first_row_per_account["account_id"].nunique() == trans["account_id"].nunique()True
first_row_per_account.query("amount != balance")| account_id | trans_id | date | type | operation | amount | balance | k_symbol | bank | account |
|---|
Mit dem obigen Code wird zudem sichergestellt, dass diese Gegebenheit für alle Kunden gilt. Nachfolgend werden die Transaktionen aggregiert, sodass die Spalten
volume: das Volumen, also die Summe der Ein- und Ausgaben auf dem Konto,
credit: die Summe der Einnahmen,
withdrawal: die Summe der Ausgaben,
n_transactions: die Anzahl der getätigten Transaktionen und
balance: der Kontostand
pro Monat entstehen. Dieser Datensatz wird dann mithilfe der nachfolgend definierten Funktion rollup_credit_card aufgerollt.
# Extract year and month from date to a new column 'year_month'
trans["year_month"] = trans["date"].dt.to_period("M")
# Group by 'account_id' and 'month', and calculate the sum of 'amount', 'credit', 'withdrawal' and 'n_transactions'
transactions_monthly = (
trans.groupby(["account_id", "year_month"])
.agg(
volume=("amount", "sum"),
credit=("amount", lambda x: x[x > 0].sum()),
withdrawal=("amount", lambda x: x[x < 0].sum()),
n_transactions=("amount", "count"),
)
.reset_index()
)
# Calculate cumulative sum of 'volume' for each account
transactions_monthly["balance"] = transactions_monthly.groupby("account_id")[
"volume"
].cumsum()# count unique account_ids in transactions_monthly
print(transactions_monthly["account_id"].nunique())
num_accounts_before = transactions_monthly["account_id"].nunique()4500
def rollup_credit_card(trans_monthly, account_card_issue_dates):
# Add issue date and calculate months since card issue
trans_monthly = pd.merge(trans_monthly, account_card_issue_dates, on="account_id")
trans_monthly["months_before_card_issue"] = [
(issued - year_month).n
for issued, year_month in zip(
trans_monthly["issued"], trans_monthly["year_month"]
)
]
# select only where months_before_card_issue > 0 and <= 13
trans_monthly = trans_monthly[
(trans_monthly["months_before_card_issue"] > 0)
& (trans_monthly["months_before_card_issue"] <= 13)
]
trans_monthly = trans_monthly.groupby("account_id").filter(lambda x: len(x) == 13)
# Pivot wider
trans_monthly = trans_monthly.pivot_table(
index="account_id",
columns="months_before_card_issue",
values=["volume", "credit", "withdrawal", "n_transactions", "balance"],
)
trans_monthly.reset_index(inplace=True)
trans_monthly.columns = [
"_".join(str(i) for i in col) for col in trans_monthly.columns
]
# rename account_id_ to account_id
trans_monthly = trans_monthly.rename(columns={"account_id_": "account_id"})
return trans_monthlybuyers = static_data[static_data["has_card"]]
# print number of buyers
print(buyers["account_id"].nunique())747
transactions_rolled_up_buyers = rollup_credit_card(
transactions_monthly, buyers.loc[:, ["account_id", "issued"]]
)
transactions_rolled_up_buyers.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 66 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 account_id 568 non-null int64
1 balance_1 568 non-null float64
2 balance_2 568 non-null float64
3 balance_3 568 non-null float64
4 balance_4 568 non-null float64
5 balance_5 568 non-null float64
6 balance_6 568 non-null float64
7 balance_7 568 non-null float64
8 balance_8 568 non-null float64
9 balance_9 568 non-null float64
10 balance_10 568 non-null float64
11 balance_11 568 non-null float64
12 balance_12 568 non-null float64
13 balance_13 568 non-null float64
14 credit_1 568 non-null float64
15 credit_2 568 non-null float64
16 credit_3 568 non-null float64
17 credit_4 568 non-null float64
18 credit_5 568 non-null float64
19 credit_6 568 non-null float64
20 credit_7 568 non-null float64
21 credit_8 568 non-null float64
22 credit_9 568 non-null float64
23 credit_10 568 non-null float64
24 credit_11 568 non-null float64
25 credit_12 568 non-null float64
26 credit_13 568 non-null float64
27 n_transactions_1 568 non-null float64
28 n_transactions_2 568 non-null float64
29 n_transactions_3 568 non-null float64
30 n_transactions_4 568 non-null float64
31 n_transactions_5 568 non-null float64
32 n_transactions_6 568 non-null float64
33 n_transactions_7 568 non-null float64
34 n_transactions_8 568 non-null float64
35 n_transactions_9 568 non-null float64
36 n_transactions_10 568 non-null float64
37 n_transactions_11 568 non-null float64
38 n_transactions_12 568 non-null float64
39 n_transactions_13 568 non-null float64
40 volume_1 568 non-null float64
41 volume_2 568 non-null float64
42 volume_3 568 non-null float64
43 volume_4 568 non-null float64
44 volume_5 568 non-null float64
45 volume_6 568 non-null float64
46 volume_7 568 non-null float64
47 volume_8 568 non-null float64
48 volume_9 568 non-null float64
49 volume_10 568 non-null float64
50 volume_11 568 non-null float64
51 volume_12 568 non-null float64
52 volume_13 568 non-null float64
53 withdrawal_1 568 non-null float64
54 withdrawal_2 568 non-null float64
55 withdrawal_3 568 non-null float64
56 withdrawal_4 568 non-null float64
57 withdrawal_5 568 non-null float64
58 withdrawal_6 568 non-null float64
59 withdrawal_7 568 non-null float64
60 withdrawal_8 568 non-null float64
61 withdrawal_9 568 non-null float64
62 withdrawal_10 568 non-null float64
63 withdrawal_11 568 non-null float64
64 withdrawal_12 568 non-null float64
65 withdrawal_13 568 non-null float64
dtypes: float64(65), int64(1)
memory usage: 293.0 KB
# calculate the number of buyers lost by rolling up
lost_buyers = buyers[
~buyers["account_id"].isin(transactions_rolled_up_buyers["account_id"])
]
lost_buyers = lost_buyers["account_id"].nunique()
print(lost_buyers)179
Und so entsteht ein Datensatz, welcher für 568 Kreditkartenkäufer die Merkmale balance, credit, n_transactions, volume und withdrawal für alle 13 Monate des Rollup-Fensters inklusive Lag beinhaltet.
Nicht-Käufer
Um die Transaktionen der Nicht-Käufer analog zu verarbeiten, wird ein fiktives Kaufdatum benötigt, welches als Ausgangslage für die Aufrollung dient.
# plot the issue date distribution of the buyers that where rolled up (586)
merged_data = transactions_rolled_up_buyers.merge(
buyers, how="left", left_on="account_id", right_on="account_id"
)
plt.figure(figsize=(10, 6))
merged_data["issued"].value_counts().sort_index().plot(kind="bar")
plt.title("Verteilung der Ausstellungsdaten")
plt.xlabel("Datum")
plt.ylabel("Anzahl")
plt.show()
Hier dargestellt ist die Verteilung der Ausstellungsdaten der Kreditkarten. Zu erkennen ist ein klarer aufwärtstrend.
# from transactions_monthly plot the number of distinct account ids per month
plt.figure(figsize=(10, 6))
transactions_monthly["year_month"].value_counts().sort_index().plot(kind="bar")
plt.title("Anzahl der Konten pro Monat")
plt.xlabel("Monat")
plt.ylabel("Anzahl")
plt.show()
# calculate the correlation between transactions_monthly["year_month"] and merged_data["issued"].value_counts()
transactions_monthly["year_month"].value_counts().sort_index().corr(
merged_data["issued"].value_counts().sort_index()
)0.7837318295574054
Hier dargestellt ist die Verteilung der Anzahl von eröffneten Konten pro Monat. Der hier beobachtete Aufwärtstrend der Anzahl erstellter Konten könnte ein maßgebender Einflussfaktor der Anzahl ausgestellter Kreditkarten pro Monat sein. Der Korrelationskoeffizient von 0.78 unterstreicht diese Beobachtung. Wir gehen davon aus, dass diese Gegebenheit von einem Klassifikationsmodell schnell overfitted wird, weshalb wir nachfolgend ein random sampling einsetzen, um das fiktive Issue-Date von Nicht-Käufern zu definieren.
def rollup_non_credit(trans_monthly, non_buyers, range):
# set seed
np.random.seed(43)
# for each non buyer, find the date of the first transaction
first_transaction_dates = (
trans_monthly.groupby("account_id")["year_month"].min().reset_index()
)
first_transaction_dates.columns = ["account_id", "first_transaction_date"]
# merge the first transaction dates with the non_buyers DataFrame
non_buyers = non_buyers.merge(first_transaction_dates, on="account_id", how="left")
# randomly sample a date from the range as issue date for each non buyer making sure that the random date is after the first transaction of the non buyer
non_buyers["issued"] = non_buyers["first_transaction_date"].apply(
lambda x: np.random.choice(range, 1)[0]
)
non_buyers_rolled_up = rollup_credit_card(
trans_monthly, non_buyers.loc[:, ["account_id", "issued"]]
)
return non_buyers_rolled_up, non_buyers# get the list of issue dates of buyers
issue_dates_buyers = buyers["issued"].unique()
n_non_buyers = static_data[~static_data["has_card"]]["account_id"].nunique()
transactions_rolled_up_non_buyers, non_buyers = rollup_non_credit(
transactions_monthly, static_data[~static_data["has_card"]], issue_dates_buyers
)
transactions_rolled_up_non_buyers.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1684 entries, 0 to 1683
Data columns (total 66 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 account_id 1684 non-null int64
1 balance_1 1684 non-null float64
2 balance_2 1684 non-null float64
3 balance_3 1684 non-null float64
4 balance_4 1684 non-null float64
5 balance_5 1684 non-null float64
6 balance_6 1684 non-null float64
7 balance_7 1684 non-null float64
8 balance_8 1684 non-null float64
9 balance_9 1684 non-null float64
10 balance_10 1684 non-null float64
11 balance_11 1684 non-null float64
12 balance_12 1684 non-null float64
13 balance_13 1684 non-null float64
14 credit_1 1684 non-null float64
15 credit_2 1684 non-null float64
16 credit_3 1684 non-null float64
17 credit_4 1684 non-null float64
18 credit_5 1684 non-null float64
19 credit_6 1684 non-null float64
20 credit_7 1684 non-null float64
21 credit_8 1684 non-null float64
22 credit_9 1684 non-null float64
23 credit_10 1684 non-null float64
24 credit_11 1684 non-null float64
25 credit_12 1684 non-null float64
26 credit_13 1684 non-null float64
27 n_transactions_1 1684 non-null float64
28 n_transactions_2 1684 non-null float64
29 n_transactions_3 1684 non-null float64
30 n_transactions_4 1684 non-null float64
31 n_transactions_5 1684 non-null float64
32 n_transactions_6 1684 non-null float64
33 n_transactions_7 1684 non-null float64
34 n_transactions_8 1684 non-null float64
35 n_transactions_9 1684 non-null float64
36 n_transactions_10 1684 non-null float64
37 n_transactions_11 1684 non-null float64
38 n_transactions_12 1684 non-null float64
39 n_transactions_13 1684 non-null float64
40 volume_1 1684 non-null float64
41 volume_2 1684 non-null float64
42 volume_3 1684 non-null float64
43 volume_4 1684 non-null float64
44 volume_5 1684 non-null float64
45 volume_6 1684 non-null float64
46 volume_7 1684 non-null float64
47 volume_8 1684 non-null float64
48 volume_9 1684 non-null float64
49 volume_10 1684 non-null float64
50 volume_11 1684 non-null float64
51 volume_12 1684 non-null float64
52 volume_13 1684 non-null float64
53 withdrawal_1 1684 non-null float64
54 withdrawal_2 1684 non-null float64
55 withdrawal_3 1684 non-null float64
56 withdrawal_4 1684 non-null float64
57 withdrawal_5 1684 non-null float64
58 withdrawal_6 1684 non-null float64
59 withdrawal_7 1684 non-null float64
60 withdrawal_8 1684 non-null float64
61 withdrawal_9 1684 non-null float64
62 withdrawal_10 1684 non-null float64
63 withdrawal_11 1684 non-null float64
64 withdrawal_12 1684 non-null float64
65 withdrawal_13 1684 non-null float64
dtypes: float64(65), int64(1)
memory usage: 868.4 KB
So finden wir 1684 aufgerollte Datensätze für Nicht-Käufer.
non_buyers_lost = (
n_non_buyers - transactions_rolled_up_non_buyers["account_id"].nunique()
)
print(non_buyers_lost)1924
# plot the issue date distribution of the non-buyers that where rolled up
plt.figure(figsize=(10, 6))
non_buyers["issued"].value_counts().sort_index().plot(kind="bar")
plt.title("Verteilung der Ausstellungsdaten")
plt.xlabel("Datum")
plt.ylabel("Anzahl")
plt.show()
Zusammenfügen der Daten
Nachfolgend werden die Stammdaten mit den aufgerollten Bewegungsdaten zum Modellierungsdatensatz kombiniert.
transactions_rolled_up = pd.concat(
[transactions_rolled_up_buyers, transactions_rolled_up_non_buyers]
)
# merge transactions_rolled_up and static data
X = pd.merge(static_data, transactions_rolled_up, on="account_id")Entfernen von minderjährigen Kunden
Kunden die zum Zeitpunkt des Erwerbs der Kreditkarte minderjährig waren, müssen entfernt werden, da diese Zielgruppe, wie bereits beschrieben, nicht modelliert werden soll.
num_before_underage_removal = X["account_id"].nunique()
X["issued"] = X["issued"].fillna(non_buyers["issued"])
X["issued"] = X["issued"].dt.to_timestamp()
time_to_compare = pd.Timedelta(days=6970)
# Filter underage accounts
X = X[(X["issued"] - X["birth_day"]) >= time_to_compare]
# Calculate the number of accounts after filtering
num_accounts_after = X["account_id"].nunique()
# Calculate the number of underage accounts
num_underage_accounts = num_before_underage_removal - num_accounts_after
num_underage_accounts120
120 Kunden werden durch diesen Schritt entfernt.
y = X["has_card"]
X = X.drop(
columns=[
"has_card",
"card_id",
"issued",
"type_card",
"loan_id",
"date_loan",
"disp_id",
"client_id",
"district_id_account",
"birth_day",
"type_disp",
]
)
# convert "date_account" to "days active"
X["date_account"] = (X["date_account"] - X["date_account"].min()).dt.days# show NaNs in X
X.isnull().sum()account_id 0
frequency 0
date_account 0
district_id_client 0
gender 0
age 0
amount 0
duration 0
payments 0
status 0
household 0
insurance_payment 0
leasing 0
loan_payment 0
unknown 0
district_name_account 0
region_account 0
num_of_habitat_account 0
num_of_small_town_account 0
num_of_medium_town_account 0
num_of_big_town_account 0
num_of_bigger_town_account 0
num_of_city_account 0
ratio_of_urban_account 0
average_salary_account 0
unemploy_rate95_account 0
unemploy_rate96_account 0
n_of_enterpren_per1000_inhabit_account 0
no_of_crimes95_account 0
no_of_crimes96_account 0
district_name_client 0
region_client 0
num_of_habitat_client 0
num_of_small_town_client 0
num_of_medium_town_client 0
num_of_big_town_client 0
num_of_bigger_town_client 0
num_of_city_client 0
ratio_of_urban_client 0
average_salary_client 0
unemploy_rate95_client 0
unemploy_rate96_client 0
n_of_enterpren_per1000_inhabit_client 0
no_of_crimes95_client 0
no_of_crimes96_client 0
balance_1 0
balance_2 0
balance_3 0
balance_4 0
balance_5 0
balance_6 0
balance_7 0
balance_8 0
balance_9 0
balance_10 0
balance_11 0
balance_12 0
balance_13 0
credit_1 0
credit_2 0
credit_3 0
credit_4 0
credit_5 0
credit_6 0
credit_7 0
credit_8 0
credit_9 0
credit_10 0
credit_11 0
credit_12 0
credit_13 0
n_transactions_1 0
n_transactions_2 0
n_transactions_3 0
n_transactions_4 0
n_transactions_5 0
n_transactions_6 0
n_transactions_7 0
n_transactions_8 0
n_transactions_9 0
n_transactions_10 0
n_transactions_11 0
n_transactions_12 0
n_transactions_13 0
volume_1 0
volume_2 0
volume_3 0
volume_4 0
volume_5 0
volume_6 0
volume_7 0
volume_8 0
volume_9 0
volume_10 0
volume_11 0
volume_12 0
volume_13 0
withdrawal_1 0
withdrawal_2 0
withdrawal_3 0
withdrawal_4 0
withdrawal_5 0
withdrawal_6 0
withdrawal_7 0
withdrawal_8 0
withdrawal_9 0
withdrawal_10 0
withdrawal_11 0
withdrawal_12 0
withdrawal_13 0
dtype: int64
Zu sehen ist, dass der Datensatz komplett ist, also keine fehlenden Werte aufweist.
Explorative Datenanalyse des Modellierungssatzes
Nachfolgend werden diverse Askepte des zusammengefügten Datensatzes untersucht ### Entfernte Konten In der Vorverarbeitung werden diverse Kundenkonten aus dem Datensatz entfernt. Die folgende Darstellung zeigt auf, wie viele in welchem Schritt entfernt werden.
num_junior_cards, lost_buyers, non_buyers_lost, num_underage_accounts, X.shape[0](145, 179, 1924, 120, 2132)
waterfall_data = {
"step": [
"Initial",
"Junior Card Holders",
"Lost Buyers",
"Non-Buyers",
"Underage Clients",
"Final",
],
"count": [
4500,
-num_junior_cards,
-lost_buyers,
-non_buyers_lost,
-num_underage_accounts,
X.shape[0],
],
}
waterfall_df = pd.DataFrame(waterfall_data)
blank = waterfall_df["count"].cumsum().shift(1).fillna(0)
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step[1::3] = np.nan
blank[5] = 0my_plot = waterfall_df.plot(
kind="bar",
stacked=True,
bottom=blank,
legend=False,
title="Entfernte Anzahl von Konten in verschiedenen Aufbereitungsschritten",
)
my_plot.plot(step.index, step.values, "k")
display(waterfall_df)| step | count | |
|---|---|---|
| 0 | Initial | 4500 |
| 1 | Junior Card Holders | -145 |
| 2 | Lost Buyers | -179 |
| 3 | Non-Buyers | -1924 |
| 4 | Underage Clients | -120 |
| 5 | Final | 2132 |
Insgesamt werden also 2368 Konten in der Vorberarbeitung entfernt. ### Verteilung Kartenbesitzer Nachfolgend wird die Verteilung der Kartenbesitzer aufgezeigt.
# plot distribution of has_card
plt.figure(figsize=(10, 6))
y.value_counts().plot(kind="bar")
plt.title("Verteilung der Kartenbesitzer")
plt.xlabel("Kartenbesitzer")
plt.ylabel("Anzahl")
plt.show()
Klar ersichtlich ist, dass es deutlich mehr Nicht-Kartenbesitzer als Kartenbesitzer gibt. Unbalancierten Daten erschweren die Modellierung erheblich, weshalb nachfolgend SMOTE (Synthetic Minority Over-sampling Technique) eingesetzt wird, um die Daten zu balancieren.
from imblearn.over_sampling import SMOTE
# x get dummy variables for category
X = pd.get_dummies(X, drop_first=True)
sm = SMOTE(random_state=43)
X_res, y_res = sm.fit_resample(X, y)# plot distribution of has_card
plt.figure(figsize=(10, 6))
y_res.value_counts().plot(kind="bar")
plt.title("Verteilung der Kartenbesitzer")
plt.xlabel("Kartenbesitzer")
plt.ylabel("Anzahl")
plt.show()
Durch den Einsatz von SMOTE konnte der Datensatz ausbalanciert werden. ### Konten 14 und 18 In der Aufgabenbeschreibung wurde explizit verlangt, die Konten 14 und 18 zu untersuchen. Nachfolgend dargestellt ist der Saldo und das Volumen der beiden Konten.
# Filter the data for accounts 14 and 18
account_data = X[X["account_id"].isin([14, 18])]
# Reshape the DataFrame for easier plotting
months = [f"Month {i}" for i in range(1, 14)]
balances = [f"balance_{i}" for i in range(1, 14)]
volumes = [f"volume_{i}" for i in range(1, 14)]
# Melt the DataFrame for balances and volumes
balance_data = account_data.melt(
id_vars="account_id", value_vars=balances, var_name="Month", value_name="Balance"
)
volume_data = account_data.melt(
id_vars="account_id", value_vars=volumes, var_name="Month", value_name="Volume"
)
# Convert 'Month' from string to integer for proper sorting
balance_data["Month"] = balance_data["Month"].str.extract(r"(\d+)").astype(int)
volume_data["Month"] = volume_data["Month"].str.extract(r"(\d+)").astype(int)
# Sort data by account and month
balance_data = balance_data.sort_values(by=["account_id", "Month"])
volume_data = volume_data.sort_values(by=["account_id", "Month"])
# Plotting balance data
plt.figure(figsize=(14, 7))
for key, grp in balance_data.groupby("account_id"):
plt.plot(grp["Month"], grp["Balance"], label=f"Account {key} Balances")
plt.title("Monthly Balances for Accounts 14 and 18")
plt.xlabel("Month")
plt.ylabel("Balance")
plt.legend()
plt.show()
# Plotting volume data
plt.figure(figsize=(14, 7))
for key, grp in volume_data.groupby("account_id"):
plt.plot(grp["Month"], grp["Volume"], label=f"Account {key} Volumes")
plt.title("Monthly Transaction Volumes for Accounts 14 and 18")
plt.xlabel("Month")
plt.ylabel("Volume")
plt.legend()
plt.show()

Feature Engineering
# Function to calculate features
def calculate_features(df, prefix):
monthly_values = df[[f"{prefix}_{i}" for i in range(1, 13)]]
# needs to be a small constant to avoid division by zero
epsilon = 1e-7 # small constant
features = {
f"{prefix}_mean": monthly_values.mean(axis=1),
f"{prefix}_min": monthly_values.min(axis=1),
f"{prefix}_max": monthly_values.max(axis=1),
f"{prefix}_mad": monthly_values.sub(monthly_values.mean(axis=1), axis=0)
.abs()
.mean(axis=1),
f"{prefix}_mean_ratio_last3_first3": (
monthly_values[[f"{prefix}_{i}" for i in range(10, 13)]].mean(axis=1)
/ (
monthly_values[[f"{prefix}_{i}" for i in range(1, 4)]].mean(axis=1)
+ epsilon
)
),
}
if prefix in ["credit", "withdrawal"]:
features[f"{prefix}_sum"] = monthly_values.sum(axis=1)
if prefix in ["balance", "credit"]:
features[f"{prefix}_std"] = monthly_values.std(axis=1)
return features
# List of column prefixes for required calculations
columns_to_process = ["balance", "credit", "n_transactions", "withdrawal"]
# Generating features for each prefix and merging them
all_features = {}
for prefix in columns_to_process:
all_features.update(calculate_features(X, prefix))
# Creating the final dataframe with new features
df_features = pd.DataFrame(all_features)
X_feature_engineered = pd.concat([X_res, df_features], axis=1)
display(X_feature_engineered.head(5))| account_id | date_account | district_id_client | age | amount | duration | payments | household | insurance_payment | leasing | loan_payment | unknown | num_of_habitat_account | num_of_small_town_account | num_of_medium_town_account | num_of_big_town_account | num_of_bigger_town_account | num_of_city_account | ratio_of_urban_account | average_salary_account | unemploy_rate95_account | unemploy_rate96_account | n_of_enterpren_per1000_inhabit_account | no_of_crimes95_account | no_of_crimes96_account | num_of_habitat_client | num_of_small_town_client | num_of_medium_town_client | num_of_big_town_client | num_of_bigger_town_client | num_of_city_client | ratio_of_urban_client | average_salary_client | unemploy_rate95_client | unemploy_rate96_client | n_of_enterpren_per1000_inhabit_client | no_of_crimes95_client | no_of_crimes96_client | balance_1 | balance_2 | balance_3 | balance_4 | balance_5 | balance_6 | balance_7 | balance_8 | balance_9 | balance_10 | balance_11 | balance_12 | balance_13 | credit_1 | credit_2 | credit_3 | credit_4 | credit_5 | credit_6 | credit_7 | credit_8 | credit_9 | credit_10 | credit_11 | credit_12 | credit_13 | n_transactions_1 | n_transactions_2 | n_transactions_3 | n_transactions_4 | n_transactions_5 | n_transactions_6 | n_transactions_7 | n_transactions_8 | n_transactions_9 | n_transactions_10 | n_transactions_11 | n_transactions_12 | n_transactions_13 | volume_1 | volume_2 | volume_3 | volume_4 | volume_5 | volume_6 | volume_7 | volume_8 | volume_9 | volume_10 | volume_11 | volume_12 | volume_13 | withdrawal_1 | withdrawal_2 | withdrawal_3 | withdrawal_4 | withdrawal_5 | withdrawal_6 | withdrawal_7 | withdrawal_8 | withdrawal_9 | withdrawal_10 | withdrawal_11 | withdrawal_12 | withdrawal_13 | frequency_transactional | frequency_weekly | gender_male | status_contract finished | status_finished contract, loan not paid | status_running contract | status_none | district_name_account_Beroun | district_name_account_Blansko | district_name_account_Breclav | district_name_account_Brno - mesto | district_name_account_Brno - venkov | district_name_account_Bruntal | district_name_account_Ceska Lipa | district_name_account_Ceske Budejovice | district_name_account_Cesky Krumlov | district_name_account_Cheb | district_name_account_Chomutov | district_name_account_Chrudim | district_name_account_Decin | district_name_account_Domazlice | district_name_account_Frydek - Mistek | district_name_account_Havlickuv Brod | district_name_account_Hl.m. Praha | district_name_account_Hodonin | district_name_account_Hradec Kralove | district_name_account_Jablonec n. Nisou | district_name_account_Jesenik | district_name_account_Jicin | district_name_account_Jihlava | district_name_account_Jindrichuv Hradec | district_name_account_Karlovy Vary | district_name_account_Karvina | district_name_account_Kladno | district_name_account_Klatovy | district_name_account_Kolin | district_name_account_Kromeriz | district_name_account_Kutna Hora | district_name_account_Liberec | district_name_account_Litomerice | district_name_account_Louny | district_name_account_Melnik | district_name_account_Mlada Boleslav | district_name_account_Most | district_name_account_Nachod | district_name_account_Novy Jicin | district_name_account_Nymburk | district_name_account_Olomouc | district_name_account_Opava | district_name_account_Ostrava - mesto | district_name_account_Pardubice | district_name_account_Pelhrimov | district_name_account_Pisek | district_name_account_Plzen - jih | district_name_account_Plzen - mesto | district_name_account_Plzen - sever | district_name_account_Prachatice | district_name_account_Praha - vychod | district_name_account_Praha - zapad | district_name_account_Prerov | district_name_account_Pribram | district_name_account_Prostejov | district_name_account_Rakovnik | district_name_account_Rokycany | district_name_account_Rychnov nad Kneznou | district_name_account_Semily | district_name_account_Sokolov | district_name_account_Strakonice | district_name_account_Sumperk | district_name_account_Svitavy | district_name_account_Tabor | district_name_account_Tachov | district_name_account_Teplice | district_name_account_Trebic | district_name_account_Trutnov | district_name_account_Uherske Hradiste | district_name_account_Usti nad Labem | district_name_account_Usti nad Orlici | district_name_account_Vsetin | district_name_account_Vyskov | district_name_account_Zdar nad Sazavou | district_name_account_Zlin | district_name_account_Znojmo | region_account_central Bohemia | region_account_east Bohemia | region_account_north Bohemia | region_account_north Moravia | region_account_south Bohemia | region_account_south Moravia | region_account_west Bohemia | district_name_client_Beroun | district_name_client_Blansko | district_name_client_Breclav | district_name_client_Brno - mesto | district_name_client_Brno - venkov | district_name_client_Bruntal | district_name_client_Ceska Lipa | district_name_client_Ceske Budejovice | district_name_client_Cesky Krumlov | district_name_client_Cheb | district_name_client_Chomutov | district_name_client_Chrudim | district_name_client_Decin | district_name_client_Domazlice | district_name_client_Frydek - Mistek | district_name_client_Havlickuv Brod | district_name_client_Hl.m. Praha | district_name_client_Hodonin | district_name_client_Hradec Kralove | district_name_client_Jablonec n. Nisou | district_name_client_Jesenik | district_name_client_Jicin | district_name_client_Jihlava | district_name_client_Jindrichuv Hradec | district_name_client_Karlovy Vary | district_name_client_Karvina | district_name_client_Kladno | district_name_client_Klatovy | district_name_client_Kolin | district_name_client_Kromeriz | district_name_client_Kutna Hora | district_name_client_Liberec | district_name_client_Litomerice | district_name_client_Louny | district_name_client_Melnik | district_name_client_Mlada Boleslav | district_name_client_Most | district_name_client_Nachod | district_name_client_Novy Jicin | district_name_client_Nymburk | district_name_client_Olomouc | district_name_client_Opava | district_name_client_Ostrava - mesto | district_name_client_Pardubice | district_name_client_Pelhrimov | district_name_client_Pisek | district_name_client_Plzen - jih | district_name_client_Plzen - mesto | district_name_client_Plzen - sever | district_name_client_Prachatice | district_name_client_Praha - vychod | district_name_client_Praha - zapad | district_name_client_Prerov | district_name_client_Pribram | district_name_client_Prostejov | district_name_client_Rakovnik | district_name_client_Rokycany | district_name_client_Rychnov nad Kneznou | district_name_client_Semily | district_name_client_Sokolov | district_name_client_Strakonice | district_name_client_Sumperk | district_name_client_Svitavy | district_name_client_Tabor | district_name_client_Tachov | district_name_client_Teplice | district_name_client_Trebic | district_name_client_Trutnov | district_name_client_Uherske Hradiste | district_name_client_Usti nad Labem | district_name_client_Usti nad Orlici | district_name_client_Vsetin | district_name_client_Vyskov | district_name_client_Zdar nad Sazavou | district_name_client_Zlin | district_name_client_Znojmo | region_client_central Bohemia | region_client_east Bohemia | region_client_north Bohemia | region_client_north Moravia | region_client_south Bohemia | region_client_south Moravia | region_client_west Bohemia | balance_mean | balance_min | balance_max | balance_mad | balance_mean_ratio_last3_first3 | balance_std | credit_mean | credit_min | credit_max | credit_mad | credit_mean_ratio_last3_first3 | credit_sum | credit_std | n_transactions_mean | n_transactions_min | n_transactions_max | n_transactions_mad | n_transactions_mean_ratio_last3_first3 | withdrawal_mean | withdrawal_min | withdrawal_max | withdrawal_mad | withdrawal_mean_ratio_last3_first3 | withdrawal_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 812 | 18 | 29 | 0.0 | 0.0 | 0.0 | 2452.0 | 0.0 | 0.0 | 0.0 | 0.0 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 16729.6 | 15450.5 | 15408.2 | 14942.8 | 13174.1 | 14795.8 | 18221.8 | 18037.0 | 16752.2 | 15864.8 | 16655.3 | 13878.8 | 12611.2 | 3745.7 | 3738.9 | 3742.0 | 4235.3 | 3734.9 | 3740.6 | 3751.4 | 3751.4 | 3744.0 | 3746.1 | 5243.1 | 3734.2 | 3733.1 | 4.0 | 6.0 | 5.0 | 5.0 | 10.0 | 5.0 | 5.0 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 1279.1 | 42.3 | 465.4 | 1768.7 | -1621.7 | -3426.0 | 184.8 | 1284.8 | 887.4 | -790.5 | 2776.5 | 1267.6 | 96.5 | -2466.6 | -3696.6 | -3276.6 | -2466.6 | -5356.6 | -7166.6 | -3566.6 | -2466.6 | -2856.6 | -4536.6 | -2466.6 | -2466.6 | -3636.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | 15825.908333 | 13174.1 | 18221.8 | 1217.541667 | 0.975006 | 1536.614777 | 3908.966667 | 3734.2 | 5243.1 | 276.744444 | 1.133326 | 46907.6 | 443.380170 | 5.333333 | 4.0 | 10.0 | 1.000000 | 1.0000 | -3565.766667 | -7166.6 | -2466.6 | 1082.361111 | 1.003178 | -42789.2 |
| 1 | 2 | 56 | 1 | 54 | 80952.0 | 24.0 | 3373.0 | 7266.0 | 0.0 | 0.0 | 3372.7 | 0.0 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 44015.2 | 42865.3 | 50313.5 | 48541.2 | 36767.2 | 40557.9 | 29056.8 | 40875.9 | 31781.1 | 53875.0 | 51794.3 | 51644.3 | 52091.7 | 20430.5 | 20430.4 | 20452.9 | 30554.6 | 20389.9 | 22181.7 | 20380.5 | 20375.4 | 21686.7 | 30561.3 | 20430.6 | 20433.2 | 20427.3 | 6.0 | 8.0 | 6.0 | 6.0 | 7.0 | 6.0 | 7.0 | 5.0 | 11.0 | 7.0 | 6.0 | 6.0 | 6.0 | 1149.9 | -7448.2 | 1772.3 | 11774.0 | -3790.7 | 11501.1 | -11819.1 | 9094.8 | -22093.9 | 2080.7 | 150.0 | -447.4 | 8346.7 | -19280.6 | -27878.6 | -18680.6 | -18780.6 | -24180.6 | -10680.6 | -32199.6 | -11280.6 | -43780.6 | -28480.6 | -20280.6 | -20880.6 | -12080.6 | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | 43507.308333 | 29056.8 | 53875.0 | 6523.275000 | 1.146651 | 8102.086523 | 22358.975000 | 20375.4 | 30561.3 | 2732.991667 | 1.164911 | 268307.7 | 3874.799149 | 6.750000 | 5.0 | 11.0 | 1.041667 | 0.9500 | -23032.016667 | -43780.6 | -10680.6 | 6893.319444 | 1.057746 | -276384.2 |
| 2 | 6 | 634 | 51 | 61 | 0.0 | 0.0 | 0.0 | 3954.0 | 0.0 | 0.0 | 0.0 | 0.0 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 40099.3 | 37238.2 | 35585.5 | 42793.9 | 42156.0 | 39286.2 | 38664.2 | 35809.0 | 45988.3 | 44140.3 | 41261.9 | 38395.2 | 35540.5 | 6829.7 | 6821.3 | 6840.2 | 6846.5 | 6838.4 | 6830.6 | 6823.8 | 6839.3 | 6856.6 | 6847.0 | 6835.3 | 6823.3 | 6814.1 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 4.0 | 11.0 | 6.0 | 4.0 | 4.0 | 4.0 | 5.0 | 2861.1 | 1652.7 | -7208.4 | 637.9 | 2869.8 | 622.0 | 2855.2 | -10179.3 | 1848.0 | 2878.4 | 2866.7 | 2854.7 | 1725.5 | -3968.6 | -5168.6 | -14048.6 | -6208.6 | -3968.6 | -6208.6 | -3968.6 | -17018.6 | -5008.6 | -3968.6 | -3968.6 | -3968.6 | -5088.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | 40118.166667 | 35585.5 | 45988.3 | 2624.927778 | 1.096299 | 3253.526919 | 6836.000000 | 6821.3 | 6856.6 | 8.666667 | 1.000703 | 82032.0 | 10.810180 | 5.166667 | 4.0 | 11.0 | 1.250000 | 0.8000 | -6456.100000 | -17018.6 | -3968.6 | 3025.833333 | 0.513495 | -77473.2 |
| 3 | 7 | 1423 | 60 | 64 | 0.0 | 0.0 | 0.0 | 4880.0 | 0.0 | 0.0 | 0.0 | 0.0 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 75336.5 | 75049.4 | 75577.3 | 72590.7 | 65368.4 | 67762.9 | 64207.9 | 60067.1 | 50259.0 | 61397.2 | 57823.3 | 58197.0 | 53779.8 | 22981.7 | 22986.7 | 22981.2 | 34296.9 | 22960.1 | 22949.6 | 22935.4 | 22902.7 | 22916.4 | 34268.5 | 22920.9 | 22911.8 | 22896.3 | 5.0 | 6.0 | 5.0 | 6.0 | 6.0 | 6.0 | 5.0 | 5.0 | 11.0 | 5.0 | 5.0 | 5.0 | 5.0 | 287.1 | -527.9 | 2986.6 | 7222.3 | -2394.5 | 3555.0 | 4140.8 | 9808.1 | -11138.2 | 3573.9 | -373.7 | 4417.2 | 3601.7 | -22694.6 | -23514.6 | -19994.6 | -27074.6 | -25354.6 | -19394.6 | -18794.6 | -13094.6 | -34054.6 | -30694.6 | -23294.6 | -18494.6 | -19294.6 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | 65303.058333 | 50259.0 | 75577.3 | 6644.475000 | 0.785161 | 8185.479842 | 24834.325000 | 22902.7 | 34296.9 | 3149.458333 | 1.161736 | 298011.9 | 4413.427098 | 5.833333 | 5.0 | 11.0 | 0.972222 | 0.9375 | -23037.933333 | -34054.6 | -13094.6 | 4293.333333 | 1.094859 | -276455.2 |
| 4 | 14 | 1426 | 47 | 57 | 0.0 | 0.0 | 0.0 | 3629.0 | 0.0 | 0.0 | 0.0 | 0.0 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 37267.1 | 35586.0 | 39596.5 | 35521.3 | 37235.6 | 39639.8 | 36948.6 | 37158.9 | 36070.9 | 34195.0 | 34812.2 | 37116.3 | 36107.7 | 14924.7 | 14933.1 | 22318.8 | 14929.3 | 14939.4 | 14934.8 | 14933.3 | 14931.6 | 22319.5 | 14926.4 | 14910.5 | 14908.6 | 14888.4 | 5.0 | 5.0 | 6.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.0 | 3.0 | 3.0 | 1681.1 | -4010.5 | 4075.2 | -1714.3 | -2404.2 | 2691.2 | -210.3 | 1088.0 | 1875.9 | -617.2 | -2304.1 | 1008.6 | 5488.4 | -13243.6 | -18943.6 | -18243.6 | -16643.6 | -17343.6 | -12243.6 | -15143.6 | -13843.6 | -20443.6 | -15543.6 | -17214.6 | -13900.0 | -9400.0 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Remove the Variable that can lead to data leakage
def clean_data(df):
# Define unnecessary columns
unnecessary_cols = [
"disp_id",
"client_id",
"account_id",
"type_card",
"card_id",
"loan_id",
"district_id_account",
"district_id_client",
]
# Drop these columns if they exist in the dataframe
df_cleaned = df.drop(columns=[col for col in unnecessary_cols if col in df.columns])
return df_cleaned
X_res = clean_data(X_res)
X_feature_engineered = clean_data(X_feature_engineered)display(X_feature_engineered.head(5))
display(X_res.head(5))| date_account | age | amount | duration | payments | household | insurance_payment | leasing | loan_payment | unknown | num_of_habitat_account | num_of_small_town_account | num_of_medium_town_account | num_of_big_town_account | num_of_bigger_town_account | num_of_city_account | ratio_of_urban_account | average_salary_account | unemploy_rate95_account | unemploy_rate96_account | n_of_enterpren_per1000_inhabit_account | no_of_crimes95_account | no_of_crimes96_account | num_of_habitat_client | num_of_small_town_client | num_of_medium_town_client | num_of_big_town_client | num_of_bigger_town_client | num_of_city_client | ratio_of_urban_client | average_salary_client | unemploy_rate95_client | unemploy_rate96_client | n_of_enterpren_per1000_inhabit_client | no_of_crimes95_client | no_of_crimes96_client | balance_1 | balance_2 | balance_3 | balance_4 | balance_5 | balance_6 | balance_7 | balance_8 | balance_9 | balance_10 | balance_11 | balance_12 | balance_13 | credit_1 | credit_2 | credit_3 | credit_4 | credit_5 | credit_6 | credit_7 | credit_8 | credit_9 | credit_10 | credit_11 | credit_12 | credit_13 | n_transactions_1 | n_transactions_2 | n_transactions_3 | n_transactions_4 | n_transactions_5 | n_transactions_6 | n_transactions_7 | n_transactions_8 | n_transactions_9 | n_transactions_10 | n_transactions_11 | n_transactions_12 | n_transactions_13 | volume_1 | volume_2 | volume_3 | volume_4 | volume_5 | volume_6 | volume_7 | volume_8 | volume_9 | volume_10 | volume_11 | volume_12 | volume_13 | withdrawal_1 | withdrawal_2 | withdrawal_3 | withdrawal_4 | withdrawal_5 | withdrawal_6 | withdrawal_7 | withdrawal_8 | withdrawal_9 | withdrawal_10 | withdrawal_11 | withdrawal_12 | withdrawal_13 | frequency_transactional | frequency_weekly | gender_male | status_contract finished | status_finished contract, loan not paid | status_running contract | status_none | district_name_account_Beroun | district_name_account_Blansko | district_name_account_Breclav | district_name_account_Brno - mesto | district_name_account_Brno - venkov | district_name_account_Bruntal | district_name_account_Ceska Lipa | district_name_account_Ceske Budejovice | district_name_account_Cesky Krumlov | district_name_account_Cheb | district_name_account_Chomutov | district_name_account_Chrudim | district_name_account_Decin | district_name_account_Domazlice | district_name_account_Frydek - Mistek | district_name_account_Havlickuv Brod | district_name_account_Hl.m. Praha | district_name_account_Hodonin | district_name_account_Hradec Kralove | district_name_account_Jablonec n. Nisou | district_name_account_Jesenik | district_name_account_Jicin | district_name_account_Jihlava | district_name_account_Jindrichuv Hradec | district_name_account_Karlovy Vary | district_name_account_Karvina | district_name_account_Kladno | district_name_account_Klatovy | district_name_account_Kolin | district_name_account_Kromeriz | district_name_account_Kutna Hora | district_name_account_Liberec | district_name_account_Litomerice | district_name_account_Louny | district_name_account_Melnik | district_name_account_Mlada Boleslav | district_name_account_Most | district_name_account_Nachod | district_name_account_Novy Jicin | district_name_account_Nymburk | district_name_account_Olomouc | district_name_account_Opava | district_name_account_Ostrava - mesto | district_name_account_Pardubice | district_name_account_Pelhrimov | district_name_account_Pisek | district_name_account_Plzen - jih | district_name_account_Plzen - mesto | district_name_account_Plzen - sever | district_name_account_Prachatice | district_name_account_Praha - vychod | district_name_account_Praha - zapad | district_name_account_Prerov | district_name_account_Pribram | district_name_account_Prostejov | district_name_account_Rakovnik | district_name_account_Rokycany | district_name_account_Rychnov nad Kneznou | district_name_account_Semily | district_name_account_Sokolov | district_name_account_Strakonice | district_name_account_Sumperk | district_name_account_Svitavy | district_name_account_Tabor | district_name_account_Tachov | district_name_account_Teplice | district_name_account_Trebic | district_name_account_Trutnov | district_name_account_Uherske Hradiste | district_name_account_Usti nad Labem | district_name_account_Usti nad Orlici | district_name_account_Vsetin | district_name_account_Vyskov | district_name_account_Zdar nad Sazavou | district_name_account_Zlin | district_name_account_Znojmo | region_account_central Bohemia | region_account_east Bohemia | region_account_north Bohemia | region_account_north Moravia | region_account_south Bohemia | region_account_south Moravia | region_account_west Bohemia | district_name_client_Beroun | district_name_client_Blansko | district_name_client_Breclav | district_name_client_Brno - mesto | district_name_client_Brno - venkov | district_name_client_Bruntal | district_name_client_Ceska Lipa | district_name_client_Ceske Budejovice | district_name_client_Cesky Krumlov | district_name_client_Cheb | district_name_client_Chomutov | district_name_client_Chrudim | district_name_client_Decin | district_name_client_Domazlice | district_name_client_Frydek - Mistek | district_name_client_Havlickuv Brod | district_name_client_Hl.m. Praha | district_name_client_Hodonin | district_name_client_Hradec Kralove | district_name_client_Jablonec n. Nisou | district_name_client_Jesenik | district_name_client_Jicin | district_name_client_Jihlava | district_name_client_Jindrichuv Hradec | district_name_client_Karlovy Vary | district_name_client_Karvina | district_name_client_Kladno | district_name_client_Klatovy | district_name_client_Kolin | district_name_client_Kromeriz | district_name_client_Kutna Hora | district_name_client_Liberec | district_name_client_Litomerice | district_name_client_Louny | district_name_client_Melnik | district_name_client_Mlada Boleslav | district_name_client_Most | district_name_client_Nachod | district_name_client_Novy Jicin | district_name_client_Nymburk | district_name_client_Olomouc | district_name_client_Opava | district_name_client_Ostrava - mesto | district_name_client_Pardubice | district_name_client_Pelhrimov | district_name_client_Pisek | district_name_client_Plzen - jih | district_name_client_Plzen - mesto | district_name_client_Plzen - sever | district_name_client_Prachatice | district_name_client_Praha - vychod | district_name_client_Praha - zapad | district_name_client_Prerov | district_name_client_Pribram | district_name_client_Prostejov | district_name_client_Rakovnik | district_name_client_Rokycany | district_name_client_Rychnov nad Kneznou | district_name_client_Semily | district_name_client_Sokolov | district_name_client_Strakonice | district_name_client_Sumperk | district_name_client_Svitavy | district_name_client_Tabor | district_name_client_Tachov | district_name_client_Teplice | district_name_client_Trebic | district_name_client_Trutnov | district_name_client_Uherske Hradiste | district_name_client_Usti nad Labem | district_name_client_Usti nad Orlici | district_name_client_Vsetin | district_name_client_Vyskov | district_name_client_Zdar nad Sazavou | district_name_client_Zlin | district_name_client_Znojmo | region_client_central Bohemia | region_client_east Bohemia | region_client_north Bohemia | region_client_north Moravia | region_client_south Bohemia | region_client_south Moravia | region_client_west Bohemia | balance_mean | balance_min | balance_max | balance_mad | balance_mean_ratio_last3_first3 | balance_std | credit_mean | credit_min | credit_max | credit_mad | credit_mean_ratio_last3_first3 | credit_sum | credit_std | n_transactions_mean | n_transactions_min | n_transactions_max | n_transactions_mad | n_transactions_mean_ratio_last3_first3 | withdrawal_mean | withdrawal_min | withdrawal_max | withdrawal_mad | withdrawal_mean_ratio_last3_first3 | withdrawal_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 812 | 29 | 0.0 | 0.0 | 0.0 | 2452.0 | 0.0 | 0.0 | 0.0 | 0.0 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 16729.6 | 15450.5 | 15408.2 | 14942.8 | 13174.1 | 14795.8 | 18221.8 | 18037.0 | 16752.2 | 15864.8 | 16655.3 | 13878.8 | 12611.2 | 3745.7 | 3738.9 | 3742.0 | 4235.3 | 3734.9 | 3740.6 | 3751.4 | 3751.4 | 3744.0 | 3746.1 | 5243.1 | 3734.2 | 3733.1 | 4.0 | 6.0 | 5.0 | 5.0 | 10.0 | 5.0 | 5.0 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 1279.1 | 42.3 | 465.4 | 1768.7 | -1621.7 | -3426.0 | 184.8 | 1284.8 | 887.4 | -790.5 | 2776.5 | 1267.6 | 96.5 | -2466.6 | -3696.6 | -3276.6 | -2466.6 | -5356.6 | -7166.6 | -3566.6 | -2466.6 | -2856.6 | -4536.6 | -2466.6 | -2466.6 | -3636.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | 15825.908333 | 13174.1 | 18221.8 | 1217.541667 | 0.975006 | 1536.614777 | 3908.966667 | 3734.2 | 5243.1 | 276.744444 | 1.133326 | 46907.6 | 443.380170 | 5.333333 | 4.0 | 10.0 | 1.000000 | 1.0000 | -3565.766667 | -7166.6 | -2466.6 | 1082.361111 | 1.003178 | -42789.2 |
| 1 | 56 | 54 | 80952.0 | 24.0 | 3373.0 | 7266.0 | 0.0 | 0.0 | 3372.7 | 0.0 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 44015.2 | 42865.3 | 50313.5 | 48541.2 | 36767.2 | 40557.9 | 29056.8 | 40875.9 | 31781.1 | 53875.0 | 51794.3 | 51644.3 | 52091.7 | 20430.5 | 20430.4 | 20452.9 | 30554.6 | 20389.9 | 22181.7 | 20380.5 | 20375.4 | 21686.7 | 30561.3 | 20430.6 | 20433.2 | 20427.3 | 6.0 | 8.0 | 6.0 | 6.0 | 7.0 | 6.0 | 7.0 | 5.0 | 11.0 | 7.0 | 6.0 | 6.0 | 6.0 | 1149.9 | -7448.2 | 1772.3 | 11774.0 | -3790.7 | 11501.1 | -11819.1 | 9094.8 | -22093.9 | 2080.7 | 150.0 | -447.4 | 8346.7 | -19280.6 | -27878.6 | -18680.6 | -18780.6 | -24180.6 | -10680.6 | -32199.6 | -11280.6 | -43780.6 | -28480.6 | -20280.6 | -20880.6 | -12080.6 | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | 43507.308333 | 29056.8 | 53875.0 | 6523.275000 | 1.146651 | 8102.086523 | 22358.975000 | 20375.4 | 30561.3 | 2732.991667 | 1.164911 | 268307.7 | 3874.799149 | 6.750000 | 5.0 | 11.0 | 1.041667 | 0.9500 | -23032.016667 | -43780.6 | -10680.6 | 6893.319444 | 1.057746 | -276384.2 |
| 2 | 634 | 61 | 0.0 | 0.0 | 0.0 | 3954.0 | 0.0 | 0.0 | 0.0 | 0.0 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 40099.3 | 37238.2 | 35585.5 | 42793.9 | 42156.0 | 39286.2 | 38664.2 | 35809.0 | 45988.3 | 44140.3 | 41261.9 | 38395.2 | 35540.5 | 6829.7 | 6821.3 | 6840.2 | 6846.5 | 6838.4 | 6830.6 | 6823.8 | 6839.3 | 6856.6 | 6847.0 | 6835.3 | 6823.3 | 6814.1 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 4.0 | 11.0 | 6.0 | 4.0 | 4.0 | 4.0 | 5.0 | 2861.1 | 1652.7 | -7208.4 | 637.9 | 2869.8 | 622.0 | 2855.2 | -10179.3 | 1848.0 | 2878.4 | 2866.7 | 2854.7 | 1725.5 | -3968.6 | -5168.6 | -14048.6 | -6208.6 | -3968.6 | -6208.6 | -3968.6 | -17018.6 | -5008.6 | -3968.6 | -3968.6 | -3968.6 | -5088.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | 40118.166667 | 35585.5 | 45988.3 | 2624.927778 | 1.096299 | 3253.526919 | 6836.000000 | 6821.3 | 6856.6 | 8.666667 | 1.000703 | 82032.0 | 10.810180 | 5.166667 | 4.0 | 11.0 | 1.250000 | 0.8000 | -6456.100000 | -17018.6 | -3968.6 | 3025.833333 | 0.513495 | -77473.2 |
| 3 | 1423 | 64 | 0.0 | 0.0 | 0.0 | 4880.0 | 0.0 | 0.0 | 0.0 | 0.0 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 75336.5 | 75049.4 | 75577.3 | 72590.7 | 65368.4 | 67762.9 | 64207.9 | 60067.1 | 50259.0 | 61397.2 | 57823.3 | 58197.0 | 53779.8 | 22981.7 | 22986.7 | 22981.2 | 34296.9 | 22960.1 | 22949.6 | 22935.4 | 22902.7 | 22916.4 | 34268.5 | 22920.9 | 22911.8 | 22896.3 | 5.0 | 6.0 | 5.0 | 6.0 | 6.0 | 6.0 | 5.0 | 5.0 | 11.0 | 5.0 | 5.0 | 5.0 | 5.0 | 287.1 | -527.9 | 2986.6 | 7222.3 | -2394.5 | 3555.0 | 4140.8 | 9808.1 | -11138.2 | 3573.9 | -373.7 | 4417.2 | 3601.7 | -22694.6 | -23514.6 | -19994.6 | -27074.6 | -25354.6 | -19394.6 | -18794.6 | -13094.6 | -34054.6 | -30694.6 | -23294.6 | -18494.6 | -19294.6 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | 65303.058333 | 50259.0 | 75577.3 | 6644.475000 | 0.785161 | 8185.479842 | 24834.325000 | 22902.7 | 34296.9 | 3149.458333 | 1.161736 | 298011.9 | 4413.427098 | 5.833333 | 5.0 | 11.0 | 0.972222 | 0.9375 | -23037.933333 | -34054.6 | -13094.6 | 4293.333333 | 1.094859 | -276455.2 |
| 4 | 1426 | 57 | 0.0 | 0.0 | 0.0 | 3629.0 | 0.0 | 0.0 | 0.0 | 0.0 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 37267.1 | 35586.0 | 39596.5 | 35521.3 | 37235.6 | 39639.8 | 36948.6 | 37158.9 | 36070.9 | 34195.0 | 34812.2 | 37116.3 | 36107.7 | 14924.7 | 14933.1 | 22318.8 | 14929.3 | 14939.4 | 14934.8 | 14933.3 | 14931.6 | 22319.5 | 14926.4 | 14910.5 | 14908.6 | 14888.4 | 5.0 | 5.0 | 6.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.0 | 3.0 | 3.0 | 1681.1 | -4010.5 | 4075.2 | -1714.3 | -2404.2 | 2691.2 | -210.3 | 1088.0 | 1875.9 | -617.2 | -2304.1 | 1008.6 | 5488.4 | -13243.6 | -18943.6 | -18243.6 | -16643.6 | -17343.6 | -12243.6 | -15143.6 | -13843.6 | -20443.6 | -15543.6 | -17214.6 | -13900.0 | -9400.0 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| date_account | age | amount | duration | payments | household | insurance_payment | leasing | loan_payment | unknown | num_of_habitat_account | num_of_small_town_account | num_of_medium_town_account | num_of_big_town_account | num_of_bigger_town_account | num_of_city_account | ratio_of_urban_account | average_salary_account | unemploy_rate95_account | unemploy_rate96_account | n_of_enterpren_per1000_inhabit_account | no_of_crimes95_account | no_of_crimes96_account | num_of_habitat_client | num_of_small_town_client | num_of_medium_town_client | num_of_big_town_client | num_of_bigger_town_client | num_of_city_client | ratio_of_urban_client | average_salary_client | unemploy_rate95_client | unemploy_rate96_client | n_of_enterpren_per1000_inhabit_client | no_of_crimes95_client | no_of_crimes96_client | balance_1 | balance_2 | balance_3 | balance_4 | balance_5 | balance_6 | balance_7 | balance_8 | balance_9 | balance_10 | balance_11 | balance_12 | balance_13 | credit_1 | credit_2 | credit_3 | credit_4 | credit_5 | credit_6 | credit_7 | credit_8 | credit_9 | credit_10 | credit_11 | credit_12 | credit_13 | n_transactions_1 | n_transactions_2 | n_transactions_3 | n_transactions_4 | n_transactions_5 | n_transactions_6 | n_transactions_7 | n_transactions_8 | n_transactions_9 | n_transactions_10 | n_transactions_11 | n_transactions_12 | n_transactions_13 | volume_1 | volume_2 | volume_3 | volume_4 | volume_5 | volume_6 | volume_7 | volume_8 | volume_9 | volume_10 | volume_11 | volume_12 | volume_13 | withdrawal_1 | withdrawal_2 | withdrawal_3 | withdrawal_4 | withdrawal_5 | withdrawal_6 | withdrawal_7 | withdrawal_8 | withdrawal_9 | withdrawal_10 | withdrawal_11 | withdrawal_12 | withdrawal_13 | frequency_transactional | frequency_weekly | gender_male | status_contract finished | status_finished contract, loan not paid | status_running contract | status_none | district_name_account_Beroun | district_name_account_Blansko | district_name_account_Breclav | district_name_account_Brno - mesto | district_name_account_Brno - venkov | district_name_account_Bruntal | district_name_account_Ceska Lipa | district_name_account_Ceske Budejovice | district_name_account_Cesky Krumlov | district_name_account_Cheb | district_name_account_Chomutov | district_name_account_Chrudim | district_name_account_Decin | district_name_account_Domazlice | district_name_account_Frydek - Mistek | district_name_account_Havlickuv Brod | district_name_account_Hl.m. Praha | district_name_account_Hodonin | district_name_account_Hradec Kralove | district_name_account_Jablonec n. Nisou | district_name_account_Jesenik | district_name_account_Jicin | district_name_account_Jihlava | district_name_account_Jindrichuv Hradec | district_name_account_Karlovy Vary | district_name_account_Karvina | district_name_account_Kladno | district_name_account_Klatovy | district_name_account_Kolin | district_name_account_Kromeriz | district_name_account_Kutna Hora | district_name_account_Liberec | district_name_account_Litomerice | district_name_account_Louny | district_name_account_Melnik | district_name_account_Mlada Boleslav | district_name_account_Most | district_name_account_Nachod | district_name_account_Novy Jicin | district_name_account_Nymburk | district_name_account_Olomouc | district_name_account_Opava | district_name_account_Ostrava - mesto | district_name_account_Pardubice | district_name_account_Pelhrimov | district_name_account_Pisek | district_name_account_Plzen - jih | district_name_account_Plzen - mesto | district_name_account_Plzen - sever | district_name_account_Prachatice | district_name_account_Praha - vychod | district_name_account_Praha - zapad | district_name_account_Prerov | district_name_account_Pribram | district_name_account_Prostejov | district_name_account_Rakovnik | district_name_account_Rokycany | district_name_account_Rychnov nad Kneznou | district_name_account_Semily | district_name_account_Sokolov | district_name_account_Strakonice | district_name_account_Sumperk | district_name_account_Svitavy | district_name_account_Tabor | district_name_account_Tachov | district_name_account_Teplice | district_name_account_Trebic | district_name_account_Trutnov | district_name_account_Uherske Hradiste | district_name_account_Usti nad Labem | district_name_account_Usti nad Orlici | district_name_account_Vsetin | district_name_account_Vyskov | district_name_account_Zdar nad Sazavou | district_name_account_Zlin | district_name_account_Znojmo | region_account_central Bohemia | region_account_east Bohemia | region_account_north Bohemia | region_account_north Moravia | region_account_south Bohemia | region_account_south Moravia | region_account_west Bohemia | district_name_client_Beroun | district_name_client_Blansko | district_name_client_Breclav | district_name_client_Brno - mesto | district_name_client_Brno - venkov | district_name_client_Bruntal | district_name_client_Ceska Lipa | district_name_client_Ceske Budejovice | district_name_client_Cesky Krumlov | district_name_client_Cheb | district_name_client_Chomutov | district_name_client_Chrudim | district_name_client_Decin | district_name_client_Domazlice | district_name_client_Frydek - Mistek | district_name_client_Havlickuv Brod | district_name_client_Hl.m. Praha | district_name_client_Hodonin | district_name_client_Hradec Kralove | district_name_client_Jablonec n. Nisou | district_name_client_Jesenik | district_name_client_Jicin | district_name_client_Jihlava | district_name_client_Jindrichuv Hradec | district_name_client_Karlovy Vary | district_name_client_Karvina | district_name_client_Kladno | district_name_client_Klatovy | district_name_client_Kolin | district_name_client_Kromeriz | district_name_client_Kutna Hora | district_name_client_Liberec | district_name_client_Litomerice | district_name_client_Louny | district_name_client_Melnik | district_name_client_Mlada Boleslav | district_name_client_Most | district_name_client_Nachod | district_name_client_Novy Jicin | district_name_client_Nymburk | district_name_client_Olomouc | district_name_client_Opava | district_name_client_Ostrava - mesto | district_name_client_Pardubice | district_name_client_Pelhrimov | district_name_client_Pisek | district_name_client_Plzen - jih | district_name_client_Plzen - mesto | district_name_client_Plzen - sever | district_name_client_Prachatice | district_name_client_Praha - vychod | district_name_client_Praha - zapad | district_name_client_Prerov | district_name_client_Pribram | district_name_client_Prostejov | district_name_client_Rakovnik | district_name_client_Rokycany | district_name_client_Rychnov nad Kneznou | district_name_client_Semily | district_name_client_Sokolov | district_name_client_Strakonice | district_name_client_Sumperk | district_name_client_Svitavy | district_name_client_Tabor | district_name_client_Tachov | district_name_client_Teplice | district_name_client_Trebic | district_name_client_Trutnov | district_name_client_Uherske Hradiste | district_name_client_Usti nad Labem | district_name_client_Usti nad Orlici | district_name_client_Vsetin | district_name_client_Vyskov | district_name_client_Zdar nad Sazavou | district_name_client_Zlin | district_name_client_Znojmo | region_client_central Bohemia | region_client_east Bohemia | region_client_north Bohemia | region_client_north Moravia | region_client_south Bohemia | region_client_south Moravia | region_client_west Bohemia | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 812 | 29 | 0.0 | 0.0 | 0.0 | 2452.0 | 0.0 | 0.0 | 0.0 | 0.0 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 70699 | 60 | 13 | 2 | 1 | 4 | 65.3 | 8968 | 2.83 | 3.35 | 131 | 1740.0 | 1910 | 16729.6 | 15450.5 | 15408.2 | 14942.8 | 13174.1 | 14795.8 | 18221.8 | 18037.0 | 16752.2 | 15864.8 | 16655.3 | 13878.8 | 12611.2 | 3745.7 | 3738.9 | 3742.0 | 4235.3 | 3734.9 | 3740.6 | 3751.4 | 3751.4 | 3744.0 | 3746.1 | 5243.1 | 3734.2 | 3733.1 | 4.0 | 6.0 | 5.0 | 5.0 | 10.0 | 5.0 | 5.0 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 1279.1 | 42.3 | 465.4 | 1768.7 | -1621.7 | -3426.0 | 184.8 | 1284.8 | 887.4 | -790.5 | 2776.5 | 1267.6 | 96.5 | -2466.6 | -3696.6 | -3276.6 | -2466.6 | -5356.6 | -7166.6 | -3566.6 | -2466.6 | -2856.6 | -4536.6 | -2466.6 | -2466.6 | -3636.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False |
| 1 | 56 | 54 | 80952.0 | 24.0 | 3373.0 | 7266.0 | 0.0 | 0.0 | 3372.7 | 0.0 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 1204953 | 0 | 0 | 0 | 1 | 1 | 100.0 | 12541 | 0.29 | 0.43 | 167 | 85677.0 | 99107 | 44015.2 | 42865.3 | 50313.5 | 48541.2 | 36767.2 | 40557.9 | 29056.8 | 40875.9 | 31781.1 | 53875.0 | 51794.3 | 51644.3 | 52091.7 | 20430.5 | 20430.4 | 20452.9 | 30554.6 | 20389.9 | 22181.7 | 20380.5 | 20375.4 | 21686.7 | 30561.3 | 20430.6 | 20433.2 | 20427.3 | 6.0 | 8.0 | 6.0 | 6.0 | 7.0 | 6.0 | 7.0 | 5.0 | 11.0 | 7.0 | 6.0 | 6.0 | 6.0 | 1149.9 | -7448.2 | 1772.3 | 11774.0 | -3790.7 | 11501.1 | -11819.1 | 9094.8 | -22093.9 | 2080.7 | 150.0 | -447.4 | 8346.7 | -19280.6 | -27878.6 | -18680.6 | -18780.6 | -24180.6 | -10680.6 | -32199.6 | -11280.6 | -43780.6 | -28480.6 | -20280.6 | -20880.6 | -12080.6 | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | 634 | 61 | 0.0 | 0.0 | 0.0 | 3954.0 | 0.0 | 0.0 | 0.0 | 0.0 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 121947 | 37 | 28 | 7 | 3 | 11 | 70.5 | 8541 | 2.51 | 2.97 | 131 | 3496.0 | 3839 | 40099.3 | 37238.2 | 35585.5 | 42793.9 | 42156.0 | 39286.2 | 38664.2 | 35809.0 | 45988.3 | 44140.3 | 41261.9 | 38395.2 | 35540.5 | 6829.7 | 6821.3 | 6840.2 | 6846.5 | 6838.4 | 6830.6 | 6823.8 | 6839.3 | 6856.6 | 6847.0 | 6835.3 | 6823.3 | 6814.1 | 4.0 | 5.0 | 6.0 | 5.0 | 4.0 | 5.0 | 4.0 | 11.0 | 6.0 | 4.0 | 4.0 | 4.0 | 5.0 | 2861.1 | 1652.7 | -7208.4 | 637.9 | 2869.8 | 622.0 | 2855.2 | -10179.3 | 1848.0 | 2878.4 | 2866.7 | 2854.7 | 1725.5 | -3968.6 | -5168.6 | -14048.6 | -6208.6 | -3968.6 | -6208.6 | -3968.6 | -17018.6 | -5008.6 | -3968.6 | -3968.6 | -3968.6 | -5088.6 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False |
| 3 | 1423 | 64 | 0.0 | 0.0 | 0.0 | 4880.0 | 0.0 | 0.0 | 0.0 | 0.0 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 110643 | 49 | 41 | 4 | 1 | 4 | 51.9 | 8441 | 3.45 | 4.48 | 115 | 1879.0 | 2252 | 75336.5 | 75049.4 | 75577.3 | 72590.7 | 65368.4 | 67762.9 | 64207.9 | 60067.1 | 50259.0 | 61397.2 | 57823.3 | 58197.0 | 53779.8 | 22981.7 | 22986.7 | 22981.2 | 34296.9 | 22960.1 | 22949.6 | 22935.4 | 22902.7 | 22916.4 | 34268.5 | 22920.9 | 22911.8 | 22896.3 | 5.0 | 6.0 | 5.0 | 6.0 | 6.0 | 6.0 | 5.0 | 5.0 | 11.0 | 5.0 | 5.0 | 5.0 | 5.0 | 287.1 | -527.9 | 2986.6 | 7222.3 | -2394.5 | 3555.0 | 4140.8 | 9808.1 | -11138.2 | 3573.9 | -373.7 | 4417.2 | 3601.7 | -22694.6 | -23514.6 | -19994.6 | -27074.6 | -25354.6 | -19394.6 | -18794.6 | -13094.6 | -34054.6 | -30694.6 | -23294.6 | -18494.6 | -19294.6 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False |
| 4 | 1426 | 57 | 0.0 | 0.0 | 0.0 | 3629.0 | 0.0 | 0.0 | 0.0 | 0.0 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 162580 | 83 | 26 | 5 | 1 | 6 | 72.8 | 9538 | 1.51 | 1.81 | 111 | 6079.0 | 5410 | 37267.1 | 35586.0 | 39596.5 | 35521.3 | 37235.6 | 39639.8 | 36948.6 | 37158.9 | 36070.9 | 34195.0 | 34812.2 | 37116.3 | 36107.7 | 14924.7 | 14933.1 | 22318.8 | 14929.3 | 14939.4 | 14934.8 | 14933.3 | 14931.6 | 22319.5 | 14926.4 | 14910.5 | 14908.6 | 14888.4 | 5.0 | 5.0 | 6.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4.0 | 3.0 | 3.0 | 1681.1 | -4010.5 | 4075.2 | -1714.3 | -2404.2 | 2691.2 | -210.3 | 1088.0 | 1875.9 | -617.2 | -2304.1 | 1008.6 | 5488.4 | -13243.6 | -18943.6 | -18243.6 | -16643.6 | -17343.6 | -12243.6 | -15143.6 | -13843.6 | -20443.6 | -15543.6 | -17214.6 | -13900.0 | -9400.0 | False | False | True | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False |
# only print the columns with missing values
X_res.isnull().sum()[X_res.isnull().sum() > 0]
X_feature_engineered.isnull().sum()[X_feature_engineered.isnull().sum() > 0]balance_mean 996
balance_min 996
balance_max 996
balance_mad 996
balance_mean_ratio_last3_first3 996
balance_std 996
credit_mean 996
credit_min 996
credit_max 996
credit_mad 996
credit_mean_ratio_last3_first3 996
credit_sum 996
credit_std 996
n_transactions_mean 996
n_transactions_min 996
n_transactions_max 996
n_transactions_mad 996
n_transactions_mean_ratio_last3_first3 996
withdrawal_mean 996
withdrawal_min 996
withdrawal_max 996
withdrawal_mad 996
withdrawal_mean_ratio_last3_first3 996
withdrawal_sum 996
dtype: int64
# impute missing values with knn imputation but in dataframe
# import KNNImputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X_res = pd.DataFrame(imputer.fit_transform(X_res), columns=X_res.columns)
X_feature_engineered = pd.DataFrame(
imputer.fit_transform(X_feature_engineered), columns=X_feature_engineered.columns
)# only print the columns with missing values
X_res.isnull().sum()[X_res.isnull().sum() > 0]
X_feature_engineered.isnull().sum()[X_feature_engineered.isnull().sum() > 0]Series([], dtype: int64)
# Nomalize the data and standardize the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
X_res = pd.DataFrame(scaler.fit_transform(X_res), columns=X_res.columns)
X_feature_engineered = pd.DataFrame(
scaler.fit_transform(X_feature_engineered), columns=X_feature_engineered.columns
)5. Evaluations Daten
# Assuming 'X' is your DataFrame and 'has_card' is the target variable
# y_features = X_feature_engineered["has_card"]
# X_feature_engineered.drop("has_card", axis=1, inplace=True)
# we use kfold for cross validation and then the X_test and y_test are used for evaluation on never seen data
X_train, X_test, y_train, y_test = train_test_split(
X_res, y_res, test_size=0.1, random_state=42, stratify=y_res
)
X_train_features, X_test_features, y_train_features, y_test_features = train_test_split(
X_feature_engineered, y_res, test_size=0.1, random_state=42, stratify=y_res
)Drop Featrues
X_train.dtypesdate_account float64
age float64
amount float64
duration float64
payments float64
household float64
insurance_payment float64
leasing float64
loan_payment float64
unknown float64
num_of_habitat_account float64
num_of_small_town_account float64
num_of_medium_town_account float64
num_of_big_town_account float64
num_of_bigger_town_account float64
num_of_city_account float64
ratio_of_urban_account float64
average_salary_account float64
unemploy_rate95_account float64
unemploy_rate96_account float64
n_of_enterpren_per1000_inhabit_account float64
no_of_crimes95_account float64
no_of_crimes96_account float64
num_of_habitat_client float64
num_of_small_town_client float64
num_of_medium_town_client float64
num_of_big_town_client float64
num_of_bigger_town_client float64
num_of_city_client float64
ratio_of_urban_client float64
average_salary_client float64
unemploy_rate95_client float64
unemploy_rate96_client float64
n_of_enterpren_per1000_inhabit_client float64
no_of_crimes95_client float64
no_of_crimes96_client float64
balance_1 float64
balance_2 float64
balance_3 float64
balance_4 float64
balance_5 float64
balance_6 float64
balance_7 float64
balance_8 float64
balance_9 float64
balance_10 float64
balance_11 float64
balance_12 float64
balance_13 float64
credit_1 float64
credit_2 float64
credit_3 float64
credit_4 float64
credit_5 float64
credit_6 float64
credit_7 float64
credit_8 float64
credit_9 float64
credit_10 float64
credit_11 float64
credit_12 float64
credit_13 float64
n_transactions_1 float64
n_transactions_2 float64
n_transactions_3 float64
n_transactions_4 float64
n_transactions_5 float64
n_transactions_6 float64
n_transactions_7 float64
n_transactions_8 float64
n_transactions_9 float64
n_transactions_10 float64
n_transactions_11 float64
n_transactions_12 float64
n_transactions_13 float64
volume_1 float64
volume_2 float64
volume_3 float64
volume_4 float64
volume_5 float64
volume_6 float64
volume_7 float64
volume_8 float64
volume_9 float64
volume_10 float64
volume_11 float64
volume_12 float64
volume_13 float64
withdrawal_1 float64
withdrawal_2 float64
withdrawal_3 float64
withdrawal_4 float64
withdrawal_5 float64
withdrawal_6 float64
withdrawal_7 float64
withdrawal_8 float64
withdrawal_9 float64
withdrawal_10 float64
withdrawal_11 float64
withdrawal_12 float64
withdrawal_13 float64
frequency_transactional float64
frequency_weekly float64
gender_male float64
status_contract finished float64
status_finished contract, loan not paid float64
status_running contract float64
status_none float64
district_name_account_Beroun float64
district_name_account_Blansko float64
district_name_account_Breclav float64
district_name_account_Brno - mesto float64
district_name_account_Brno - venkov float64
district_name_account_Bruntal float64
district_name_account_Ceska Lipa float64
district_name_account_Ceske Budejovice float64
district_name_account_Cesky Krumlov float64
district_name_account_Cheb float64
district_name_account_Chomutov float64
district_name_account_Chrudim float64
district_name_account_Decin float64
district_name_account_Domazlice float64
district_name_account_Frydek - Mistek float64
district_name_account_Havlickuv Brod float64
district_name_account_Hl.m. Praha float64
district_name_account_Hodonin float64
district_name_account_Hradec Kralove float64
district_name_account_Jablonec n. Nisou float64
district_name_account_Jesenik float64
district_name_account_Jicin float64
district_name_account_Jihlava float64
district_name_account_Jindrichuv Hradec float64
district_name_account_Karlovy Vary float64
district_name_account_Karvina float64
district_name_account_Kladno float64
district_name_account_Klatovy float64
district_name_account_Kolin float64
district_name_account_Kromeriz float64
district_name_account_Kutna Hora float64
district_name_account_Liberec float64
district_name_account_Litomerice float64
district_name_account_Louny float64
district_name_account_Melnik float64
district_name_account_Mlada Boleslav float64
district_name_account_Most float64
district_name_account_Nachod float64
district_name_account_Novy Jicin float64
district_name_account_Nymburk float64
district_name_account_Olomouc float64
district_name_account_Opava float64
district_name_account_Ostrava - mesto float64
district_name_account_Pardubice float64
district_name_account_Pelhrimov float64
district_name_account_Pisek float64
district_name_account_Plzen - jih float64
district_name_account_Plzen - mesto float64
district_name_account_Plzen - sever float64
district_name_account_Prachatice float64
district_name_account_Praha - vychod float64
district_name_account_Praha - zapad float64
district_name_account_Prerov float64
district_name_account_Pribram float64
district_name_account_Prostejov float64
district_name_account_Rakovnik float64
district_name_account_Rokycany float64
district_name_account_Rychnov nad Kneznou float64
district_name_account_Semily float64
district_name_account_Sokolov float64
district_name_account_Strakonice float64
district_name_account_Sumperk float64
district_name_account_Svitavy float64
district_name_account_Tabor float64
district_name_account_Tachov float64
district_name_account_Teplice float64
district_name_account_Trebic float64
district_name_account_Trutnov float64
district_name_account_Uherske Hradiste float64
district_name_account_Usti nad Labem float64
district_name_account_Usti nad Orlici float64
district_name_account_Vsetin float64
district_name_account_Vyskov float64
district_name_account_Zdar nad Sazavou float64
district_name_account_Zlin float64
district_name_account_Znojmo float64
region_account_central Bohemia float64
region_account_east Bohemia float64
region_account_north Bohemia float64
region_account_north Moravia float64
region_account_south Bohemia float64
region_account_south Moravia float64
region_account_west Bohemia float64
district_name_client_Beroun float64
district_name_client_Blansko float64
district_name_client_Breclav float64
district_name_client_Brno - mesto float64
district_name_client_Brno - venkov float64
district_name_client_Bruntal float64
district_name_client_Ceska Lipa float64
district_name_client_Ceske Budejovice float64
district_name_client_Cesky Krumlov float64
district_name_client_Cheb float64
district_name_client_Chomutov float64
district_name_client_Chrudim float64
district_name_client_Decin float64
district_name_client_Domazlice float64
district_name_client_Frydek - Mistek float64
district_name_client_Havlickuv Brod float64
district_name_client_Hl.m. Praha float64
district_name_client_Hodonin float64
district_name_client_Hradec Kralove float64
district_name_client_Jablonec n. Nisou float64
district_name_client_Jesenik float64
district_name_client_Jicin float64
district_name_client_Jihlava float64
district_name_client_Jindrichuv Hradec float64
district_name_client_Karlovy Vary float64
district_name_client_Karvina float64
district_name_client_Kladno float64
district_name_client_Klatovy float64
district_name_client_Kolin float64
district_name_client_Kromeriz float64
district_name_client_Kutna Hora float64
district_name_client_Liberec float64
district_name_client_Litomerice float64
district_name_client_Louny float64
district_name_client_Melnik float64
district_name_client_Mlada Boleslav float64
district_name_client_Most float64
district_name_client_Nachod float64
district_name_client_Novy Jicin float64
district_name_client_Nymburk float64
district_name_client_Olomouc float64
district_name_client_Opava float64
district_name_client_Ostrava - mesto float64
district_name_client_Pardubice float64
district_name_client_Pelhrimov float64
district_name_client_Pisek float64
district_name_client_Plzen - jih float64
district_name_client_Plzen - mesto float64
district_name_client_Plzen - sever float64
district_name_client_Prachatice float64
district_name_client_Praha - vychod float64
district_name_client_Praha - zapad float64
district_name_client_Prerov float64
district_name_client_Pribram float64
district_name_client_Prostejov float64
district_name_client_Rakovnik float64
district_name_client_Rokycany float64
district_name_client_Rychnov nad Kneznou float64
district_name_client_Semily float64
district_name_client_Sokolov float64
district_name_client_Strakonice float64
district_name_client_Sumperk float64
district_name_client_Svitavy float64
district_name_client_Tabor float64
district_name_client_Tachov float64
district_name_client_Teplice float64
district_name_client_Trebic float64
district_name_client_Trutnov float64
district_name_client_Uherske Hradiste float64
district_name_client_Usti nad Labem float64
district_name_client_Usti nad Orlici float64
district_name_client_Vsetin float64
district_name_client_Vyskov float64
district_name_client_Zdar nad Sazavou float64
district_name_client_Zlin float64
district_name_client_Znojmo float64
region_client_central Bohemia float64
region_client_east Bohemia float64
region_client_north Bohemia float64
region_client_north Moravia float64
region_client_south Bohemia float64
region_client_south Moravia float64
region_client_west Bohemia float64
dtype: object
6. Modeling und Model Selection
Für die Model Selection benutzen wir einen StratifiedKFold mit 10 Folds in dem wir nur den Train split Folden, denn später brauchen wir die Test Daten für den Error Assesment.
from lime.lime_tabular import LimeTabularExplainer
import os
import joblib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import (
roc_curve,
auc,
make_scorer,
fbeta_score,
cohen_kappa_score,
matthews_corrcoef,
confusion_matrix,
ConfusionMatrixDisplay,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Define the directory to save models and CV predictions
MODEL_DIR = "saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)
class ModelEvaluator:
def __init__(self, models, param_grid, X, y, X_test, y_test, selected_fields=None):
"""
Initialize the evaluator with models, their parameter grids, and data.
:param models: dict of (name, model) pairs
:param param_grid: dict of (name, param_grid) pairs for GridSearch
:param X: Feature matrix
:param y: Target vector
:param selected_fields: Fields selected for training
"""
self.benchmark_results = {}
self.models = models
self.param_grid = param_grid
self.X = X[selected_fields]
self.y = y
self.eval_data = X_test
self.eval_target = y_test
self.fitted_models = {}
self.best_models = {}
self.cv_predictions = {}
self.load_all_models()
def save_model(self, model, model_name):
joblib.dump(model, os.path.join(MODEL_DIR, f"{model_name}.pkl"))
# Save CV predictions if they exist
if model_name in self.cv_predictions:
joblib.dump(
self.cv_predictions[model_name],
os.path.join(MODEL_DIR, f"{model_name}_cv_preds.pkl"),
)
# Save benchmark results
if model_name in self.benchmark_results:
joblib.dump(
self.benchmark_results[model_name],
os.path.join(MODEL_DIR, f"{model_name}_benchmark_results.pkl"),
)
def load_model(self, model_name):
model_path = os.path.join(MODEL_DIR, f"{model_name}.pkl")
cv_preds_path = os.path.join(MODEL_DIR, f"{model_name}_cv_preds.pkl")
benchmark_results_path = os.path.join(
MODEL_DIR, f"{model_name}_benchmark_results.pkl"
)
if os.path.exists(model_path):
model = joblib.load(model_path)
if os.path.exists(cv_preds_path):
cv_preds = joblib.load(cv_preds_path)
self.cv_predictions[model_name] = cv_preds
if os.path.exists(benchmark_results_path):
benchmark_results = joblib.load(benchmark_results_path)
self.benchmark_results[model_name] = benchmark_results
return model
else:
return None
def load_all_models(self):
for name in self.models:
model = self.load_model(name)
if model:
print(f"Loaded saved model for {name}")
self.fitted_models[name] = model
def get_benchmark_results(self):
return self.benchmark_results
def fit_models(self, cv):
for name, model in self.models.items():
if name in self.fitted_models:
print(f"Skipping training for {name} as it is already loaded")
continue
pipeline = self.create_pipeline(model)
# Prefix the parameters with the step name 'model'
grid_search_params = {
f"model__{param}": values
for param, values in self.param_grid[name].items()
}
grid_search = GridSearchCV(
estimator=pipeline,
param_grid=grid_search_params,
cv=cv,
scoring="accuracy",
n_jobs=-1,
verbose=1,
)
grid_search.fit(self.X, self.y)
print(f"Best parameters for {name}: {grid_search.best_params_}")
best_pipeline = grid_search.best_estimator_
self.save_model(best_pipeline, name)
print(f"Saved model for {name}")
metrics = {
"roc_auc": "roc_auc",
"precision": "precision",
"recall": "recall",
"accuracy": "accuracy",
"f1": make_scorer(fbeta_score, beta=1),
"kappa": make_scorer(cohen_kappa_score),
"mcc": make_scorer(matthews_corrcoef),
}
self.benchmark_results[name] = {}
all_cv_preds = np.zeros(len(self.y))
results = cross_validate(
best_pipeline,
self.X,
self.y,
cv=cv,
scoring=metrics,
return_estimator=True,
n_jobs=-1,
verbose=0,
)
for metric_name in metrics.keys():
self.benchmark_results[name][metric_name] = np.mean(
results["test_" + metric_name]
)
print(
f"{name}: {metric_name} = {np.mean(results['test_' + metric_name]):.2f}"
)
for train_idx, test_idx in cv.split(self.X, self.y):
best_pipeline.fit(self.X.iloc[train_idx], self.y.iloc[train_idx])
all_cv_preds[test_idx] = best_pipeline.predict_proba(
self.X.iloc[test_idx]
)[:, 1]
self.cv_predictions[name] = all_cv_preds
self.fitted_models[name] = best_pipeline.fit(self.X, self.y)
# Save the CV predictions after fitting
self.save_model(best_pipeline, name)
def evaluate_models(self):
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
if not self.fitted_models:
self.fit_models(cv)
else:
# Check if there are any models that need to be trained
models_to_train = [
name for name in self.models if name not in self.fitted_models
]
if models_to_train:
for name in models_to_train:
pipeline = self.create_pipeline(self.models[name])
# Prefix the parameters with the step name 'model'
grid_search_params = {
f"model__{param}": values
for param, values in self.param_grid[name].items()
}
grid_search = GridSearchCV(
estimator=pipeline,
param_grid=grid_search_params,
cv=cv,
scoring="accuracy",
n_jobs=-1,
verbose=1,
)
grid_search.fit(self.X, self.y)
print(f"Best parameters for {name}: {grid_search.best_params_}")
best_pipeline = grid_search.best_estimator_
self.save_model(best_pipeline, name)
print(f"Saved model for {name}")
metrics = {
"roc_auc": "roc_auc",
"precision": "precision",
"recall": "recall",
"accuracy": "accuracy",
"f1": make_scorer(fbeta_score, beta=1),
"kappa": make_scorer(cohen_kappa_score),
"mcc": make_scorer(matthews_corrcoef),
}
self.benchmark_results[name] = {}
all_cv_preds = np.zeros(len(self.y))
results = cross_validate(
best_pipeline,
self.X,
self.y,
cv=cv,
scoring=metrics,
return_estimator=True,
n_jobs=-1,
verbose=0,
)
for metric_name in metrics.keys():
self.benchmark_results[name][metric_name] = np.mean(
results["test_" + metric_name]
)
print(
f"{name}: {metric_name} = {np.mean(results['test_' + metric_name]):.2f}"
)
for train_idx, test_idx in cv.split(self.X, self.y):
best_pipeline.fit(
self.X.iloc[train_idx], self.y.iloc[train_idx]
)
all_cv_preds[test_idx] = best_pipeline.predict_proba(
self.X.iloc[test_idx]
)[:, 1]
self.cv_predictions[name] = all_cv_preds
self.fitted_models[name] = best_pipeline.fit(self.X, self.y)
# Save the CV predictions after fitting
self.save_model(best_pipeline, name)
return self.benchmark_results
def plot_roc_curves(self):
if not self.fitted_models:
self.evaluate_models()
plt.figure(figsize=(10, 8))
for name in self.fitted_models:
if name not in self.cv_predictions:
print(f"Warning: No CV predictions for {name}")
continue
y_scores = self.cv_predictions[name]
fpr, tpr, _ = roc_curve(self.y, y_scores)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"{name} (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend(loc="lower right")
plt.show()
def create_pipeline(self, model):
categorical_cols = self.X.select_dtypes(include=["category", "object"]).columns
numeric_cols = self.X.select_dtypes(include=["int64", "float64"]).columns
numeric_transformer = Pipeline(
[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler()),
]
)
categorical_transformer = Pipeline(
[
("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
[
("num", numeric_transformer, numeric_cols),
("cat", categorical_transformer, categorical_cols),
]
)
return Pipeline([("preprocessor", preprocessor), ("model", model)])
def compare_top_n_customers(self, model_name, n=100):
print(f"Comparing top {n} customers for {model_name}")
model = self.fitted_models[model_name]
probabilities = model.predict_proba(self.eval_data)[:, 1]
predictions = model.predict(self.eval_data)
top_n_indices = np.argsort(probabilities)[::-1][:n]
plt.figure()
plt.hist(probabilities[top_n_indices], bins=20, alpha=0.75)
plt.title(f"Histogram of top {n} customers' probabilities for {model_name}")
plt.xlabel("Probability")
plt.ylabel("Frequency")
plt.show()
data = {"predictions": predictions, "probabilities": probabilities}
return pd.DataFrame(data)
def plot_confusion_matrices(self):
if not self.fitted_models:
self.evaluate_models()
for name, model in self.fitted_models.items():
plt.style.use("default")
y_pred = model.predict(self.X)
cm = confusion_matrix(self.y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title(f"Confusion Matrix for {name}")
plt.show()
plt.style.use("ggplot")
class MetricsBenchmarker:
def __init__(self):
"""
Initialize the benchmarker with models and data.
:param models: dict of (name, model) pairs
:param X: Feature matrix
:param y: Target vector
:param selected_fields: Fields selected for training
"""
self.benchmark_results = {}
self.evals = []
def add_evaluator(self, evaluator: ModelEvaluator):
self.evals.append(evaluator)
def set_benchmark_results(self):
for eval in self.evals:
self.benchmark_results.update(eval.get_benchmark_results())
def display_benchmark_results_table(self):
"""
Display a table of benchmark results.
"""
results_df = pd.DataFrame(self.benchmark_results).T
display(results_df)
def plot_benchmark_results_bar_chart(self):
"""
Plot a bar chart of benchmark results.
"""
results_df = pd.DataFrame(self.benchmark_results).T
results_df.plot(kind="bar", figsize=(10, 6))
plt.title("Benchmark Results")
plt.ylabel("Score")
plt.show()model_predictions = {}
# Example usage
from sklearn.linear_model import LogisticRegression
# Define models and their parameter grids
models = {
"Baseline Logistic Regression": LogisticRegression(solver="liblinear"),
}
param_grid = {
"Baseline Logistic Regression": {"C": [0.01, 0.1, 1, 10]},
}
selected_fields = (
["age"]
+ [col for col in X_train.columns if "gender" in col]
+ [col for col in X_train.columns if "region_client" in col]
+ [f"volume_{i}" for i in range(1, 14)]
+ [f"balance_{i}" for i in range(1, 14)]
)
evaluator_baseline = ModelEvaluator(
models,
param_grid,
X_train,
y_train,
X_test,
y_test,
selected_fields=selected_fields,
)
evaluator_baseline.evaluate_models()
evaluator_baseline.plot_roc_curves()
evaluator_baseline.plot_confusion_matrices()
predictions = evaluator_baseline.compare_top_n_customers(
"Baseline Logistic Regression", n=100
)
predictions["account_id"] = X_test.index.values
model_predictions["Baseline Logistic Regression"] = predictionsLoaded saved model for Baseline Logistic Regression
Comparing top 100 customers for Baseline Logistic Regression



# Define models and their parameter grids
models = {
"Logistic Regression Features": LogisticRegression(solver="liblinear"),
}
param_grid = {
"Logistic Regression Features": {"C": [0.01, 0.1, 1, 10]},
}
selected_fields = X_train_features.columns
evaluator = ModelEvaluator(
models,
param_grid,
X_train_features,
y_train_features,
X_test_features,
y_test_features,
selected_fields=selected_fields,
)
evaluator.evaluate_models()
evaluator.plot_roc_curves()
predictions = evaluator.compare_top_n_customers("Logistic Regression Features", n=100)
predictions["account_id"] = X_test.index.values
model_predictions["Logistic Regression Features"] = predictionsLoaded saved model for Logistic Regression Features
Comparing top 100 customers for Logistic Regression Features


Overfitting because of jagged ROC curve needs Regularization
# Define models and their parameter grids
models = {
"Logistic Regression Features added": LogisticRegression(solver="liblinear"),
}
param_grid = {
"Logistic Regression Features added": {"C": [0.001, 0.01, 0.1, 1, 10]},
}
selected_fields = X_train_features.columns
# Fix the not converging models with LassoCV
# for model_name, model in models.items():
# models[model_name] = Pipeline(
# [
# ("scaler", StandardScaler()),
# (
# "feature_selection",
# SelectFromModel(LassoCV(alphas=[0.01, 0.1, 1, 10], max_iter=10000)),
# ),
# ("model", model),
# ]
# )
evaluator = ModelEvaluator(
models,
param_grid,
X_train_features,
y_train_features,
X_test_features,
y_test_features,
selected_fields=selected_fields,
)
evaluator.evaluate_models()
evaluator.plot_roc_curves()
predictions = evaluator.compare_top_n_customers(
"Logistic Regression Features added", n=100
)
predictions["account_id"] = X_test.index.values
model_predictions["Logistic Regression Features added"] = predictionsLoaded saved model for Logistic Regression Features added
Comparing top 100 customers for Logistic Regression Features added


import joblib
import os
from sklearn.ensemble import BaggingClassifier, StackingClassifier
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
AdaBoostClassifier,
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Define base models for stacking
base_models = [
("rf", RandomForestClassifier(n_estimators=100)),
("gb", GradientBoostingClassifier(n_estimators=100)),
("svc", make_pipeline(StandardScaler(), SVC(probability=True))),
]
# Define models and their parameter grids
models = {
"Random Forest": RandomForestClassifier(),
"Gradient Boosting": GradientBoostingClassifier(),
"SVM": SVC(probability=True),
"KNN": KNeighborsClassifier(),
"Decision Tree": DecisionTreeClassifier(),
"AdaBoost": AdaBoostClassifier(algorithm="SAMME"),
"Bagging": BaggingClassifier(estimator=DecisionTreeClassifier()),
"Stacking": StackingClassifier(
estimators=base_models, final_estimator=LogisticRegression()
),
}
# Full parameter grid
param_grid_full = {
"Random Forest": {
"n_estimators": [100, 200, 300, 400],
"max_depth": [None, 5, 10, 20],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"bootstrap": [True, False],
},
"Gradient Boosting": {
"n_estimators": [100, 300],
"learning_rate": [0.1, 0.001],
"max_depth": [3, 6],
"min_samples_split": [2, 10],
"min_samples_leaf": [1, 4],
"subsample": [0.8, 1.0],
},
"SVM": {
"C": [0.1],
"kernel": ["linear"],
"gamma": ["scale"],
"degree": [3, 4, 5],
},
"KNN": {
"n_neighbors": [3, 5, 9],
"weights": ["uniform", "distance"],
"algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
"leaf_size": [20, 40],
},
"Decision Tree": {
"max_depth": [None, 5, 20],
"min_samples_split": [2, 5, 20],
"min_samples_leaf": [1, 2, 4],
"criterion": ["gini", "entropy"],
},
"AdaBoost": {
"n_estimators": [50, 100, 200],
"learning_rate": [1.0, 0.1, 0.01],
"algorithm": ["SAMME", "SAMME.R"],
},
"Bagging": {
"n_estimators": [10, 100],
"max_samples": [0.5, 1.0],
"max_features": [0.5, 1.0],
"bootstrap": [True, False],
"bootstrap_features": [True, False],
},
"Stacking": {
"final_estimator__C": [0.1, 1, 10, 100],
"final_estimator__penalty": ["l2", "none"],
},
}
# Debug parameter grid
param_grid_debug = {
"Random Forest": {
"n_estimators": [100],
"max_depth": [5],
},
"Gradient Boosting": {
"n_estimators": [100],
"learning_rate": [0.1],
},
"SVM": {
"C": [1],
"kernel": ["linear"],
},
"KNN": {
"n_neighbors": [3],
"weights": ["uniform"],
},
"Decision Tree": {
"max_depth": [5],
},
"AdaBoost": {
"n_estimators": [50],
"learning_rate": [1.0],
},
"Bagging": {
"n_estimators": [10],
},
"Stacking": {
"final_estimator__C": [1],
},
}
# Select parameter grid based on debug mode
param_grid = param_grid_debug if debug_mode else param_grid_full
selected_fields = X_train_features.columns # add the new features of df_features
evaluator_models = ModelEvaluator(
models,
param_grid,
X_train_features,
y_train_features,
X_test_features,
y_test_features,
selected_fields=selected_fields,
)
results = evaluator_models.evaluate_models()
evaluator_models.plot_roc_curves()Loaded saved model for Random Forest
Loaded saved model for Gradient Boosting
Loaded saved model for SVM
Loaded saved model for KNN
Loaded saved model for Decision Tree
Loaded saved model for AdaBoost
Loaded saved model for Bagging
Loaded saved model for Stacking

Vergleich der Top-N Kundenlisten
import matplotlib.pyplot as plt
import seaborn as sns
def compare_top_customers(predictions, percentage):
top_n = {}
for model, pred_df in predictions.items():
n = int(
len(pred_df) * percentage / 100
) # calculate the number of top customers based on the percentage
top_customers = pred_df.nlargest(n, "probabilities")["account_id"]
top_n[model] = set(top_customers)
model_names = list(predictions.keys())
overlaps = pd.DataFrame(0, index=model_names, columns=model_names).astype(float)
for i in range(len(model_names)):
for j in range(i, len(model_names)):
model1, model2 = model_names[i], model_names[j]
overlap = len(top_n[model1].intersection(top_n[model2])) / n
overlaps.loc[model1, model2] = overlap
overlaps.loc[model2, model1] = overlap
plt.figure(figsize=(10, 10))
sns.heatmap(overlaps, annot=True, cmap="viridis", fmt=".1f")
plt.title(f"Overlap of Top {percentage}% Customers Between Models")
plt.show()
return overlaps# compare top n customers for all models
for model_name in models.keys():
predictions = evaluator_models.compare_top_n_customers(model_name, n=100)
predictions["account_id"] = X_test.index.values
model_predictions[model_name] = predictionsComparing top 100 customers for Random Forest
Comparing top 100 customers for Gradient Boosting
Comparing top 100 customers for SVM
Comparing top 100 customers for KNN
Comparing top 100 customers for Decision Tree
Comparing top 100 customers for AdaBoost
Comparing top 100 customers for Bagging
Comparing top 100 customers for Stacking








compare_top_customers(model_predictions, 5)
| Baseline Logistic Regression | Logistic Regression Features | Logistic Regression Features added | Random Forest | Gradient Boosting | SVM | KNN | Decision Tree | AdaBoost | Bagging | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Baseline Logistic Regression | 1.000000 | 0.533333 | 0.533333 | 0.133333 | 0.000000 | 0.200000 | 0.200000 | 0.200000 | 0.000000 | 0.133333 | 0.133333 |
| Logistic Regression Features | 0.533333 | 1.000000 | 1.000000 | 0.400000 | 0.333333 | 0.400000 | 0.266667 | 0.133333 | 0.266667 | 0.333333 | 0.400000 |
| Logistic Regression Features added | 0.533333 | 1.000000 | 1.000000 | 0.400000 | 0.333333 | 0.400000 | 0.266667 | 0.133333 | 0.266667 | 0.333333 | 0.400000 |
| Random Forest | 0.133333 | 0.400000 | 0.400000 | 1.000000 | 0.333333 | 0.333333 | 0.200000 | 0.133333 | 0.133333 | 0.333333 | 0.400000 |
| Gradient Boosting | 0.000000 | 0.333333 | 0.333333 | 0.333333 | 1.000000 | 0.400000 | 0.133333 | 0.066667 | 0.466667 | 0.333333 | 0.666667 |
| SVM | 0.200000 | 0.400000 | 0.400000 | 0.333333 | 0.400000 | 1.000000 | 0.066667 | 0.066667 | 0.266667 | 0.200000 | 0.400000 |
| KNN | 0.200000 | 0.266667 | 0.266667 | 0.200000 | 0.133333 | 0.066667 | 1.000000 | 0.333333 | 0.066667 | 0.533333 | 0.066667 |
| Decision Tree | 0.200000 | 0.133333 | 0.133333 | 0.133333 | 0.066667 | 0.066667 | 0.333333 | 1.000000 | 0.066667 | 0.200000 | 0.066667 |
| AdaBoost | 0.000000 | 0.266667 | 0.266667 | 0.133333 | 0.466667 | 0.266667 | 0.066667 | 0.066667 | 1.000000 | 0.266667 | 0.466667 |
| Bagging | 0.133333 | 0.333333 | 0.333333 | 0.333333 | 0.333333 | 0.200000 | 0.533333 | 0.200000 | 0.266667 | 1.000000 | 0.333333 |
| Stacking | 0.133333 | 0.400000 | 0.400000 | 0.400000 | 0.666667 | 0.400000 | 0.066667 | 0.066667 | 0.466667 | 0.333333 | 1.000000 |
Die oben gezeigte Matrix zeigt die Überlappung von der top 5% account_ids, welche von dem jeweiligen Modell als potentieller Kreditkartenkäufer identifiziert wurde. Allgemein sinkt die Übereinstimmung im Vergleich zu den top 10% deutlich. Die logistischen Regressionsmodelle weisen noch eine Übereinstimmung von 53% auf, gleich wie das AdaBoost und Gradient Boosting Modell. Die höchste Überlappung der Resultate liefert das Gradient Boosting Modell und das Random Forest Modell mit 66%.
compare_top_customers(model_predictions, 10)
| Baseline Logistic Regression | Logistic Regression Features | Logistic Regression Features added | Random Forest | Gradient Boosting | SVM | KNN | Decision Tree | AdaBoost | Bagging | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Baseline Logistic Regression | 1.000000 | 0.580645 | 0.580645 | 0.258065 | 0.225806 | 0.419355 | 0.161290 | 0.225806 | 0.290323 | 0.290323 | 0.225806 |
| Logistic Regression Features | 0.580645 | 1.000000 | 1.000000 | 0.419355 | 0.451613 | 0.580645 | 0.225806 | 0.258065 | 0.419355 | 0.419355 | 0.483871 |
| Logistic Regression Features added | 0.580645 | 1.000000 | 1.000000 | 0.419355 | 0.451613 | 0.580645 | 0.225806 | 0.258065 | 0.419355 | 0.419355 | 0.483871 |
| Random Forest | 0.258065 | 0.419355 | 0.419355 | 1.000000 | 0.580645 | 0.451613 | 0.225806 | 0.225806 | 0.516129 | 0.451613 | 0.774194 |
| Gradient Boosting | 0.225806 | 0.451613 | 0.451613 | 0.580645 | 1.000000 | 0.419355 | 0.258065 | 0.258065 | 0.612903 | 0.419355 | 0.580645 |
| SVM | 0.419355 | 0.580645 | 0.580645 | 0.451613 | 0.419355 | 1.000000 | 0.258065 | 0.258065 | 0.354839 | 0.387097 | 0.451613 |
| KNN | 0.161290 | 0.225806 | 0.225806 | 0.225806 | 0.258065 | 0.258065 | 1.000000 | 0.580645 | 0.193548 | 0.483871 | 0.193548 |
| Decision Tree | 0.225806 | 0.258065 | 0.258065 | 0.225806 | 0.258065 | 0.258065 | 0.580645 | 1.000000 | 0.193548 | 0.483871 | 0.225806 |
| AdaBoost | 0.290323 | 0.419355 | 0.419355 | 0.516129 | 0.612903 | 0.354839 | 0.193548 | 0.193548 | 1.000000 | 0.419355 | 0.483871 |
| Bagging | 0.290323 | 0.419355 | 0.419355 | 0.451613 | 0.419355 | 0.387097 | 0.483871 | 0.483871 | 0.419355 | 1.000000 | 0.419355 |
| Stacking | 0.225806 | 0.483871 | 0.483871 | 0.774194 | 0.580645 | 0.451613 | 0.193548 | 0.225806 | 0.483871 | 0.419355 | 1.000000 |
Die oben gezeigte Matrix zeigt die Überlappung von der top 10% der account_ids, welche von dem jeweiligen Modell als potentieller Kreditkartenkäufer identifiziert wurde. Die höchste Überschneidung haben die Resultate des Gradient Boosting und das AdaBoost Modells. Weitere Modellkombinationen mit mehr als 50% Übereinstimmung sind die logistischen regressions Modelle untereinander, wie auch im Vergleich zum SVM Modell. Andere Modellkombinationen überschneiden sich weniger als 50% in der Top-N Liste.
Results Comparision
benchmark = MetricsBenchmarker()
benchmark.add_evaluator(evaluator_baseline)
benchmark.add_evaluator(evaluator_models)
benchmark.add_evaluator(evaluator)
benchmark.set_benchmark_results()
benchmark.display_benchmark_results_table()
benchmark.plot_benchmark_results_bar_chart()| roc_auc | precision | recall | accuracy | f1 | kappa | mcc | |
|---|---|---|---|---|---|---|---|
| Baseline Logistic Regression | 0.891229 | 0.812613 | 0.853744 | 0.828068 | 0.832209 | 0.656139 | 0.657884 |
| Random Forest | 0.961725 | 0.848434 | 0.939615 | 0.885604 | 0.891552 | 0.771194 | 0.776030 |
| Gradient Boosting | 0.963185 | 0.852799 | 0.948151 | 0.891998 | 0.897833 | 0.783988 | 0.789204 |
| SVM | 0.938046 | 0.837849 | 0.928992 | 0.874256 | 0.880687 | 0.748489 | 0.753907 |
| KNN | 0.869455 | 0.798212 | 0.818891 | 0.805698 | 0.808039 | 0.611391 | 0.612247 |
| Decision Tree | 0.878287 | 0.804879 | 0.911920 | 0.845106 | 0.854784 | 0.690188 | 0.697052 |
| AdaBoost | 0.943421 | 0.840905 | 0.912655 | 0.869639 | 0.875037 | 0.739273 | 0.742559 |
| Bagging | 0.934821 | 0.850945 | 0.884230 | 0.864308 | 0.866979 | 0.728609 | 0.729710 |
| Stacking | 0.960045 | 0.859402 | 0.915476 | 0.882417 | 0.886269 | 0.764823 | 0.767041 |
| Logistic Regression Features added | 0.927750 | 0.852780 | 0.870066 | 0.859345 | 0.860630 | 0.718679 | 0.720132 |

# best model
# Define weights for the metrics
weights = {"roc_auc": 0.25, "precision": 0.25, "recall": 0.25, "f1": 0.25}
# Calculate weighted scores for each model
weighted_scores = {}
for model, metrics in benchmark.benchmark_results.items():
weighted_score = sum(
weights[metric] * score
for metric, score in metrics.items()
if metric in weights
)
weighted_scores[model] = weighted_score
# Find the best model based on weighted score
best_model_name = max(weighted_scores, key=weighted_scores.get)
best_model_score = weighted_scores[best_model_name]7. Model Assesment
# plot pipeline
from sklearn import set_config
best_model = evaluator_models.fitted_models[best_model_name]
print(f"Best model: {best_model_name}")
set_config(display="diagram")
best_modelBest model: Gradient Boosting
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
Index(['date_account', 'age', 'amount', 'duration', 'payments', 'household',
'insurance_payment', 'leasing', 'loan_payment', 'unknown',
...
'n_transactions_min', 'n_transactions_max', 'n_transactions_m...
'withdrawal_mean_ratio_last3_first3', 'withdrawal_sum'],
dtype='object', length=298)),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index([], dtype='object'))])),
('model',
GradientBoostingClassifier(max_depth=6, min_samples_leaf=4,
min_samples_split=10,
n_estimators=300, subsample=0.8))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
Index(['date_account', 'age', 'amount', 'duration', 'payments', 'household',
'insurance_payment', 'leasing', 'loan_payment', 'unknown',
...
'n_transactions_min', 'n_transactions_max', 'n_transactions_m...
'withdrawal_mean_ratio_last3_first3', 'withdrawal_sum'],
dtype='object', length=298)),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index([], dtype='object'))])),
('model',
GradientBoostingClassifier(max_depth=6, min_samples_leaf=4,
min_samples_split=10,
n_estimators=300, subsample=0.8))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler', StandardScaler())]),
Index(['date_account', 'age', 'amount', 'duration', 'payments', 'household',
'insurance_payment', 'leasing', 'loan_payment', 'unknown',
...
'n_transactions_min', 'n_transactions_max', 'n_transactions_mad',
'n_transactions_mean_ratio_last3_first3', 'withdrawal_mean',
'withdrawal_min', 'withdrawal_max', 'withdrawal_mad',
'withdrawal_mean_ratio_last3_first3', 'withdrawal_sum'],
dtype='object', length=298)),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index([], dtype='object'))])Index(['date_account', 'age', 'amount', 'duration', 'payments', 'household',
'insurance_payment', 'leasing', 'loan_payment', 'unknown',
...
'n_transactions_min', 'n_transactions_max', 'n_transactions_mad',
'n_transactions_mean_ratio_last3_first3', 'withdrawal_mean',
'withdrawal_min', 'withdrawal_max', 'withdrawal_mad',
'withdrawal_mean_ratio_last3_first3', 'withdrawal_sum'],
dtype='object', length=298)SimpleImputer(strategy='median')
StandardScaler()
Index([], dtype='object')
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore')
GradientBoostingClassifier(max_depth=6, min_samples_leaf=4,
min_samples_split=10, n_estimators=300,
subsample=0.8)# Evaluate the best model with the test set
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
cohen_kappa_score,
matthews_corrcoef,
)
y_pred = best_model.predict(X_test_features)
print(f"Evaluation of the best model ({best_model_name}) using X_test:")
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title(f"Confusion Matrix for {best_model_name} on X_test")
plt.show()
# ROC Curve
y_scores = best_model.predict_proba(X_test_features)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label=f"{best_model_name} (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve for {best_model_name} on X_test")
plt.legend(loc="lower right")
plt.show()
# Other Metrics
accuracy = accuracy_score(y_test_features, y_pred)
precision = precision_score(y_test_features, y_pred)
recall = recall_score(y_test_features, y_pred)
f1 = f1_score(y_test_features, y_pred)
kappa = cohen_kappa_score(y_test_features, y_pred)
mcc = matthews_corrcoef(y_test_features, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Cohen Kappa: {kappa:.2f}")
print(f"Matthews Correlation Coefficient: {mcc:.2f}")Evaluation of the best model (Gradient Boosting) using X_test:
Accuracy: 0.92
Precision: 0.87
Recall: 0.98
F1 Score: 0.92
Cohen Kappa: 0.83
Matthews Correlation Coefficient: 0.84


8. Model Erklärbarkeit
Random Forest Tree
# PLot Random Forest Tree
from sklearn.tree import plot_tree
# get the rf model but its not the best model get it with the name
rf_model = evaluator_models.fitted_models["Random Forest"].named_steps["model"]
print(rf_model)
# rf_model = best_model.named_steps["model"]
# plot rf_model pipeline
plt.figure(figsize=(20, 10))
plot_tree(rf_model.estimators_[0], feature_names=X_test_features.columns, filled=True)
plt.show()RandomForestClassifier(bootstrap=False, n_estimators=200)

Random Forest Feature Importance
# Assuming 'rf_model' is your trained RandomForest model and 'X_test_features' is your feature matrix
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
features = X_test_features.columns[indices]
# Filter features with importance greater than a threshold (e.g., 0.01)
threshold = 0.01
important_indices = indices[importances[indices] > threshold]
important_features = features[importances[indices] > threshold]
important_importances = importances[important_indices]
# Create the plot
plt.figure(figsize=(14, 10))
sns.barplot(
x=important_importances,
y=important_features,
palette="viridis",
hue=important_features,
dodge=False,
legend=False,
)
# Adjust the aesthetics
plt.title("Feature Importance", fontsize=16)
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
# Show the plot
plt.show()
Best Model Feature Explain
# Import necessary libraries
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt
import random
# Ensure your test data is a DataFrame
X_test_features_df = pd.DataFrame(X_test_features, columns=X_test_features.columns)
# Create LIME explainer using the training data of the best model
explainer = LimeTabularExplainer(
training_data=X_train_features.values,
feature_names=X_train_features.columns,
class_names=["No Card", "Card"],
mode="classification",
)
# Function to predict probabilities with column names
def predict_proba_with_names(X):
X_df = pd.DataFrame(X, columns=X_test_features.columns)
return best_model.predict_proba(X_df)
# Randomly select 10 instances from the test data
random.seed(42)
random_indices = random.sample(range(len(X_test_features_df)), 10)
# Generate and display LIME explanations for the 10 random instances
for idx in random_indices:
instance_to_explain = X_test_features_df.iloc[idx]
explanation = explainer.explain_instance(
data_row=instance_to_explain.values, predict_fn=predict_proba_with_names
)
print(f"Explanation for instance {idx}:")
explanation.show_in_notebook(show_table=True, show_all=False)
# Optional: Save each explanation as an HTML file
# explanation.save_to_file(f'lime_explanation_{idx}.html')Explanation for instance 57:
Explanation for instance 12:
Explanation for instance 140:
Explanation for instance 125:
Explanation for instance 114:
Explanation for instance 71:
Explanation for instance 52:
Explanation for instance 279:
Explanation for instance 44:
Explanation for instance 302:
Reduziere Modell für Erklärbarkeit
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
import lime.lime_tabular
import shap
import statsmodels.api as sm
# Ensure all data in X_train_features are numeric
X_train_features = X_train_features.apply(pd.to_numeric, errors="coerce")
# Drop rows with any NaN values in X_train_features and align y_res
X_train_features = X_train_features.dropna()
y_res_aligned = y_res.loc[X_train_features.index]
X_train_reduced = X_train_features.copy()
X_test_reduced = X_test_features.copy()
y_test_reduced = y_test_features.copy()
y_train_reduced = y_train_features.copy()
# Convert to DataFrame with feature names for consistency
X_train_reduced = pd.DataFrame(X_train_reduced, columns=X_train_features.columns)
X_test_reduced = pd.DataFrame(X_test_reduced, columns=X_train_features.columns)
# Add a constant column for the intercept
X_train_reduced = sm.add_constant(X_train_reduced)
X_test_reduced = sm.add_constant(X_test_reduced)
# Convert to numeric to avoid dtype issues
X_train_reduced = X_train_reduced.apply(pd.to_numeric)
X_test_reduced = X_test_reduced.apply(pd.to_numeric)
# Apply Lasso (L1) regularization for feature selection
lasso_model = LogisticRegressionCV(
cv=5, penalty="l1", solver="liblinear", random_state=42, Cs=np.logspace(-4, 0, 50)
)
lasso_model.fit(X_train_reduced, y_train_reduced)
# Get the features with non-zero coefficients
coef = pd.Series(lasso_model.coef_[0], index=X_train_reduced.columns)
selected_features = coef[coef != 0].index.tolist()
print("Selected features after Lasso:", selected_features)
lasso_selected_fields = selected_featuresSelected features after Lasso: ['date_account', 'household', 'leasing', 'unknown', 'num_of_bigger_town_account', 'num_of_bigger_town_client', 'num_of_city_client', 'balance_1', 'balance_2', 'balance_3', 'balance_4', 'balance_5', 'balance_6', 'balance_7', 'balance_9', 'balance_10', 'balance_13', 'n_transactions_2', 'n_transactions_3', 'n_transactions_10', 'n_transactions_12', 'withdrawal_8', 'frequency_weekly', 'gender_male', 'status_running contract', 'status_none', 'district_name_account_Blansko', 'district_name_account_Bruntal', 'district_name_account_Chrudim', 'district_name_account_Hl.m. Praha', 'district_name_account_Hodonin', 'district_name_account_Hradec Kralove', 'district_name_account_Jicin', 'district_name_account_Liberec', 'district_name_account_Nachod', 'district_name_account_Novy Jicin', 'district_name_account_Nymburk', 'district_name_account_Olomouc', 'district_name_account_Pardubice', 'district_name_account_Pribram', 'district_name_account_Rokycany', 'district_name_account_Sokolov', 'district_name_account_Strakonice', 'district_name_account_Sumperk', 'district_name_account_Svitavy', 'district_name_account_Usti nad Orlici', 'region_account_central Bohemia', 'region_account_east Bohemia', 'region_account_north Bohemia', 'region_account_north Moravia', 'region_account_south Moravia', 'region_account_west Bohemia', 'district_name_client_Beroun', 'district_name_client_Brno - mesto', 'district_name_client_Ceske Budejovice', 'district_name_client_Cheb', 'district_name_client_Domazlice', 'district_name_client_Frydek - Mistek', 'district_name_client_Hl.m. Praha', 'district_name_client_Jihlava', 'district_name_client_Karvina', 'district_name_client_Kutna Hora', 'district_name_client_Litomerice', 'district_name_client_Opava', 'district_name_client_Pisek', 'district_name_client_Plzen - jih', 'district_name_client_Prachatice', 'district_name_client_Prostejov', 'district_name_client_Trutnov', 'district_name_client_Vyskov', 'district_name_client_Zdar nad Sazavou', 'region_client_central Bohemia', 'region_client_north Bohemia', 'region_client_north Moravia', 'region_client_south Bohemia', 'region_client_south Moravia', 'region_client_west Bohemia', 'balance_mean_ratio_last3_first3', 'credit_max', 'credit_mean_ratio_last3_first3', 'credit_std', 'n_transactions_mean', 'withdrawal_mean_ratio_last3_first3']
Logistic Regression Reduziert
Wir machen kein Model Selection mehr sondern nur ein Model Assesment für Explainable AI
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
import lime.lime_tabular
import shap
# Step 1: Create and Train a Reduced Model
selected_fields_reduced = lasso_selected_fields
# Ensure all data in selected fields are numeric
X_feature_engineered[selected_fields_reduced] = X_feature_engineered[
selected_fields_reduced
].apply(pd.to_numeric)
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
X_feature_engineered[selected_fields_reduced],
y_res,
test_size=0.1,
random_state=42,
stratify=y_res,
)
# Convert to DataFrame with feature names for consistency
X_train_reduced = pd.DataFrame(X_train_reduced, columns=selected_fields_reduced)
X_test_reduced = pd.DataFrame(X_test_reduced, columns=selected_fields_reduced)
reduced_model = LogisticRegression(solver="liblinear")
reduced_model.fit(X_train_reduced, y_train_reduced)
y_pred_reduced = reduced_model.predict(X_test_reduced)
# plt theme style to standard for better visualization
plt.style.use("default")
cm = confusion_matrix(y_test_reduced, y_pred_reduced)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for Reduced Model")
plt.show()
plt.style.use("ggplot")
y_scores_reduced = reduced_model.predict_proba(X_test_reduced)[:, 1]
fpr, tpr, _ = roc_curve(y_test_reduced, y_scores_reduced)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label=f"Reduced Model (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Reduced Model")
plt.legend(loc="lower right")
plt.show()
# Step 2: Create a Wrapper Function for LIME
def predict_proba_with_feature_names(X):
X_df = pd.DataFrame(X, columns=selected_fields_reduced)
return reduced_model.predict_proba(X_df)
# Explain the Model with LIME
explainer = lime.lime_tabular.LimeTabularExplainer(
training_data=X_train_reduced.values,
feature_names=selected_fields_reduced,
class_names=["No Card", "Card"],
mode="classification",
)
for i in range(2):
exp = explainer.explain_instance(
data_row=X_test_reduced.iloc[i].values,
predict_fn=predict_proba_with_feature_names,
)
exp.show_in_notebook(show_table=True)

# Step 3: Explain the Model with SHAP
# Summarize the background data using shap.sample
background_data = shap.sample(X_train_reduced, 100)
explainer_shap = shap.KernelExplainer(reduced_model.predict_proba, background_data)
shap_values = explainer_shap.shap_values(X_test_reduced)
shap.initjs()
# Print shapes to debug
print("SHAP values shape:", np.array(shap_values).shape)
print("X_test_reduced shape:", X_test_reduced.shape)
# Verify dimensions (consider removing the extra dimension if it exists)
instance_index = 0 # Change the instance index if needed
positive_class_index = 1
if len(shap_values.shape) > 2: # Check for extra dimension
shap_values = shap_values[
:, :, positive_class_index
] # Select positive class values
assert len(shap_values[instance_index]) == X_test_reduced.shape[1], "Dimension mismatch"
# Use only the SHAP values for the positive class (index 1)
shap.force_plot(
explainer_shap.expected_value[positive_class_index],
shap_values[instance_index],
X_test_reduced.iloc[instance_index],
)
shap.summary_plot(shap_values, X_test_reduced, feature_names=selected_fields_reduced)SHAP values shape: (313, 83, 2)
X_test_reduced shape: (313, 83)

Erstellung eines Modelles, welches weder District noch Gender diskriminierung betreibt.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
import lime.lime_tabular
import shap
# Step 1: Create and Train a Reduced Model
selected_fields_reduced = lasso_selected_fields
# drop the columns that are discriminating
# Präfixe, die entfernt werden sollen
discriminating_prefixes = ["gender", "district", "region"]
# Funktion zum Entfernen der diskriminierenden Spalten
def remove_discriminating_columns(columns, prefixes):
return [
col for col in columns if not any(col.startswith(prefix) for prefix in prefixes)
]
# Neue Liste der Spalten ohne diskriminierende Spalten
selected_fields_reduced = remove_discriminating_columns(
selected_fields_reduced, discriminating_prefixes
)
# Ausgabe der gefilterten Liste
print(selected_fields_reduced)
# Ensure all data in selected fields are numeric
X_feature_engineered[selected_fields_reduced] = X_feature_engineered[
selected_fields_reduced
].apply(pd.to_numeric)
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
X_feature_engineered[selected_fields_reduced],
y_res,
test_size=0.1,
random_state=42,
stratify=y_res,
)
# Convert to DataFrame with feature names for consistency
X_train_reduced = pd.DataFrame(X_train_reduced, columns=selected_fields_reduced)
X_test_reduced = pd.DataFrame(X_test_reduced, columns=selected_fields_reduced)
reduced_model = LogisticRegression(solver="liblinear")
reduced_model.fit(X_train_reduced, y_train_reduced)
plt.style.use("default")
y_pred_reduced = reduced_model.predict(X_test_reduced)
cm = confusion_matrix(y_test_reduced, y_pred_reduced)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for Reduced Model")
plt.show()
# Change the style back to ggplot
plt.style.use("ggplot")
y_scores_reduced = reduced_model.predict_proba(X_test_reduced)[:, 1]
fpr, tpr, _ = roc_curve(y_test_reduced, y_scores_reduced)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label=f"Reduced Model (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Reduced Model")
plt.legend(loc="lower right")
plt.show()
# Step 2: Create a Wrapper Function for LIME
def predict_proba_with_feature_names(X):
X_df = pd.DataFrame(X, columns=selected_fields_reduced)
return reduced_model.predict_proba(X_df)
# Explain the Model with LIME
explainer = lime.lime_tabular.LimeTabularExplainer(
training_data=X_train_reduced.values,
feature_names=selected_fields_reduced,
class_names=["No Card", "Card"],
mode="classification",
)
for i in range(2):
exp = explainer.explain_instance(
data_row=X_test_reduced.iloc[i].values,
predict_fn=predict_proba_with_feature_names,
)
exp.show_in_notebook(show_table=True)['date_account', 'household', 'leasing', 'unknown', 'num_of_bigger_town_account', 'num_of_bigger_town_client', 'num_of_city_client', 'balance_1', 'balance_2', 'balance_3', 'balance_4', 'balance_5', 'balance_6', 'balance_7', 'balance_9', 'balance_10', 'balance_13', 'n_transactions_2', 'n_transactions_3', 'n_transactions_10', 'n_transactions_12', 'withdrawal_8', 'frequency_weekly', 'status_running contract', 'status_none', 'balance_mean_ratio_last3_first3', 'credit_max', 'credit_mean_ratio_last3_first3', 'credit_std', 'n_transactions_mean', 'withdrawal_mean_ratio_last3_first3']


# Step 3: Explain the Model with SHAP
# Summarize the background data using shap.sample
background_data = shap.sample(X_train_reduced, 100)
explainer_shap = shap.KernelExplainer(reduced_model.predict_proba, background_data)
shap_values = explainer_shap.shap_values(X_test_reduced)
shap.initjs()
# Print shapes to debug
print("SHAP values shape:", np.array(shap_values).shape)
print("X_test_reduced shape:", X_test_reduced.shape)
# Verify dimensions (consider removing the extra dimension if it exists)
instance_index = 0 # Change the instance index if needed
positive_class_index = 1
if len(shap_values.shape) > 2: # Check for extra dimension
shap_values = shap_values[
:, :, positive_class_index
] # Select positive class values
assert len(shap_values[instance_index]) == X_test_reduced.shape[1], "Dimension mismatch"
# Use only the SHAP values for the positive class (index 1)
shap.force_plot(
explainer_shap.expected_value[positive_class_index],
shap_values[instance_index],
X_test_reduced.iloc[instance_index],
)
shap.summary_plot(shap_values, X_test_reduced, feature_names=selected_fields_reduced)
# export to htmlInterpretation von den Resultaten
- todo
9. Beschreiben des Mehrwerts des “finalen” Modelles in der Praxis
Convert Notebook
import subprocess
import pathlib
import os
try:
file_path = pathlib.Path(os.path.basename(__file__))
except:
file_path = pathlib.Path("AML_MC.ipynb")
# Check the file extension
if file_path.suffix == ".qmd":
# If it's a Python script, convert it to a notebook
try:
os.system("quarto convert AML_MC.qmd")
print("Converted to notebook.")
except subprocess.CalledProcessError as e:
print("Conversion failed. Error message:", e.output)
elif file_path.suffix == ".ipynb":
# If it's a notebook, convert it to a Python script with cell markers
try:
# quatro convert ipynb to qmd
os.system("quarto convert AML_MC.ipynb")
print("Converted to qmd.")
except subprocess.CalledProcessError as e:
print("Conversion failed. Error message:", e.output)
else:
print("Unsupported file type.")import os
os.system("quarto render AML_MC.ipynb --to html")